Python字符串处理：split、join、replace、正则表达式，文本处理的瑞士军刀

字符串处理是日常编程中最常见的任务之一。Python提供了丰富的字符串操作方法，从基本的拼接、切片到强大的正则表达式。本篇将详细介绍字符串的各种操作方法，特别关注在日志分析、报表文本提取等实际场景中的应用。

Python字符串处理

1. 字符串基础回顾

# 字符串创建
s1 = 'Hello'
s2 = "Hello"
s3 = '''多行
字符串'''
s4 = r"原始字符串\n不转义"

# 字符串是不可变的
s = "Hello"
# s[0] = 'h'  # TypeError

# 索引和切片
s = "Hello, World!"
print(s[0])      # 'H'
print(s[-1])     # '!'
print(s[0:5])    # 'Hello'
print(s[7:])     # 'World!'
print(s[::-1])   # '!dlroW ,olleH'

# 字符串拼接
s1 = "Hello"
s2 = "World"
s3 = s1 + " " + s2  # "Hello World"
s4 = " ".join([s1, s2])  # "Hello World"（推荐）

# 字符串重复
s = "Ha" * 3  # "HaHaHa"

# 长度
print(len("Hello"))  # 5

# 成员检测
print("ell" in "Hello")  # True

2. 字符串格式化

2.1 f-string（推荐）

Python 3.6+引入的f-string是最简洁、最强大的格式化方式。

name = "张三"
age = 25
score = 95.5

# 基本用法
print(f"姓名：{name}，年龄：{age}")

# 表达式
print(f"明年{age + 1}岁")
print(f"姓名长度：{len(name)}")

# 调用方法
print(f"大写：{name.upper()}")

# 格式说明符
# 数字格式化
print(f"分数：{score:.2f}")        # 95.50（保留2位小数）
print(f"分数：{score:>10.2f}")     # '     95.50'（右对齐，宽度10）
print(f"分数：{score:<10.2f}")     # '95.50     '（左对齐）
print(f"分数：{score:^10.2f}")     # '  95.50   '（居中）
print(f"分数：{score:010.2f}")     # '0000095.50'（零填充）

# 整数格式化
num = 255
print(f"十进制：{num:d}")          # 255
print(f"二进制：{num:b}")          # 11111111
print(f"八进制：{num:o}")          # 377
print(f"十六进制：{num:x}")        # ff
print(f"十六进制：{num:X}")        # FF
print(f"带前缀：{num:#x}")         # 0xff
print(f"带前缀：{num:#b}")         # 0b11111111

# 千位分隔符
big_num = 1234567890
print(f"金额：{big_num:,}")        # 1,234,567,890
print(f"金额：{big_num:_}")        # 1_234_567_890

# 百分比
ratio = 0.756
print(f"比例：{ratio:.1%}")        # 75.6%

# 对齐和填充
text = "Hi"
print(f"{text:>10}")    # '        Hi'
print(f"{text:<10}")    # 'Hi        '
print(f"{text:^10}")    # '    Hi    '
print(f"{text:*^10}")   # '****Hi****'
print(f"{text:-<10}")   # 'Hi--------'

# 日期格式化
from datetime import datetime
now = datetime.now()
print(f"日期：{now:%Y-%m-%d}")           # 2024-12-18
print(f"时间：{now:%H:%M:%S}")           # 14:30:00
print(f"完整：{now:%Y-%m-%d %H:%M:%S}")  # 2024-12-18 14:30:00

# 调试模式（Python 3.8+）
x = 10
y = 20
print(f"{x=}, {y=}")        # x=10, y=20
print(f"{x + y=}")          # x + y=30

2.2 format方法

# 位置参数
print("{}，{}岁".format("张三", 25))
print("{0}，{1}岁，{0}是学生".format("张三", 25))

# 关键字参数
print("{name}，{age}岁".format(name="张三", age=25))

# 混合使用
print("{0}，{age}岁".format("张三", age=25))

# 格式说明符
print("{:.2f}".format(3.14159))  # 3.14
print("{:>10}".format("Hi"))     # '        Hi'

# 从字典格式化
person = {"name": "张三", "age": 25}
print("{name}，{age}岁".format(**person))

# 访问属性和索引
point = (3, 4)
print("x={0[0]}, y={0[1]}".format(point))

2.3 %格式化（旧式）

# 类似C语言的printf
name = "张三"
age = 25
score = 95.5

print("姓名：%s" % name)
print("年龄：%d" % age)
print("分数：%.2f" % score)
print("姓名：%s，年龄：%d" % (name, age))

# 格式说明符
print("%10s" % "Hi")      # '        Hi'（右对齐）
print("%-10s" % "Hi")     # 'Hi        '（左对齐）
print("%010d" % 42)       # '0000000042'（零填充）
print("%+d" % 42)         # '+42'（显示正号）
print("%x" % 255)         # 'ff'（十六进制）

3. 字符串常用方法

3.1 查找与替换

s = "Hello, World! Hello, Python!"

# 查找
print(s.find("Hello"))       # 0（第一次出现的位置）
print(s.find("Hello", 5))    # 14（从位置5开始找）
print(s.find("Java"))        # -1（找不到返回-1）
print(s.rfind("Hello"))      # 14（从右边开始找）

print(s.index("Hello"))      # 0（找不到会抛出ValueError）
print(s.rindex("Hello"))     # 14

print(s.count("Hello"))      # 2（出现次数）

# 替换
print(s.replace("Hello", "Hi"))           # 替换所有
print(s.replace("Hello", "Hi", 1))        # 只替换第一个

# 检查开头和结尾
print(s.startswith("Hello"))  # True
print(s.endswith("!"))        # True
print(s.startswith(("Hello", "Hi")))  # True（多个前缀）

3.2 分割与连接

# 分割
s = "apple,banana,cherry"
print(s.split(","))          # ['apple', 'banana', 'cherry']
print(s.split(",", 1))       # ['apple', 'banana,cherry']（最多分割1次）

s = "  hello   world  "
print(s.split())             # ['hello', 'world']（按空白分割）

s = "line1\nline2\nline3"
print(s.splitlines())        # ['line1', 'line2', 'line3']

# 从右边分割
s = "a.b.c.d"
print(s.rsplit(".", 1))      # ['a.b.c', 'd']

# 分割并保留分隔符
import re
s = "one1two2three3four"
print(re.split(r'(\d)', s))  # ['one', '1', 'two', '2', 'three', '3', 'four']

# 连接
words = ["apple", "banana", "cherry"]
print(",".join(words))       # "apple,banana,cherry"
print(" ".join(words))       # "apple banana cherry"
print("\n".join(words))      # 多行

# 注意：join只能连接字符串列表
numbers = [1, 2, 3]
# print(",".join(numbers))   # TypeError
print(",".join(map(str, numbers)))  # "1,2,3"

3.3 大小写转换

s = "Hello, World!"

print(s.upper())       # "HELLO, WORLD!"
print(s.lower())       # "hello, world!"
print(s.capitalize())  # "Hello, world!"（首字母大写）
print(s.title())       # "Hello, World!"（每个单词首字母大写）
print(s.swapcase())    # "hELLO, wORLD!"（大小写互换）

# 大小写不敏感比较
s1 = "Hello"
s2 = "hello"
print(s1.lower() == s2.lower())  # True
print(s1.casefold() == s2.casefold())  # True（更强的大小写折叠）

3.4 去除空白与填充

s = "  Hello, World!  "

# 去除空白
print(s.strip())       # "Hello, World!"（两端）
print(s.lstrip())      # "Hello, World!  "（左端）
print(s.rstrip())      # "  Hello, World!"（右端）

# 去除指定字符
s = "###Hello###"
print(s.strip("#"))    # "Hello"

# 填充
s = "Hi"
print(s.center(10))         # "    Hi    "
print(s.center(10, "*"))    # "****Hi****"
print(s.ljust(10))          # "Hi        "
print(s.rjust(10))          # "        Hi"
print(s.zfill(5))           # "000Hi"

# 数字填充
num = "42"
print(num.zfill(5))         # "00042"
print("-42".zfill(5))       # "-0042"（符号在前）

3.5 判断方法

# 内容判断
print("123".isdigit())      # True（全是数字）
print("abc".isalpha())      # True（全是字母）
print("abc123".isalnum())   # True（全是字母或数字）
print("   ".isspace())      # True（全是空白）
print("Hello".istitle())    # True（标题格式）
print("HELLO".isupper())    # True（全大写）
print("hello".islower())    # True（全小写）

# 数字判断的区别
print("123".isdigit())      # True
print("123".isdecimal())    # True
print("123".isnumeric())    # True
print("½".isdigit())        # False
print("½".isnumeric())      # True（包含更多数字字符）

# 标识符判断
print("my_var".isidentifier())   # True（合法的变量名）
print("2var".isidentifier())     # False
print("import".isidentifier())   # True（但是关键字）

import keyword
print(keyword.iskeyword("import"))  # True

4. 正则表达式入门

4.1 什么是正则表达式

正则表达式（Regular Expression，简称regex）是一种强大的文本匹配工具，用于描述字符串的模式。

4.2 基本语法

元字符	说明	示例
`.`	匹配任意字符（除换行）	`a.c` 匹配 “abc”, “a1c”
`^`	匹配字符串开头	`^Hello`
`$`	匹配字符串结尾	`World$`
`*`	匹配0次或多次	`ab*` 匹配 “a”, “ab”, “abb”
`+`	匹配1次或多次	`ab+` 匹配 “ab”, “abb”
`?`	匹配0次或1次	`ab?` 匹配 “a”, “ab”
`{n}`	匹配n次	`a{3}` 匹配 “aaa”
`{n,m}`	匹配n到m次	`a{2,4}` 匹配 “aa”, “aaa”, “aaaa”
`[]`	字符集	`[abc]` 匹配 “a”, “b”, “c”
`[^]`	否定字符集	`[^abc]` 匹配非a/b/c的字符
`|`	或	`a|b` 匹配 “a” 或 “b”
`()`	分组	`(ab)+` 匹配 “ab”, “abab”
`\d`	数字（str模式下默认匹配所有Unicode数字，加re.ASCII时等价于 [0-9]）	`\d+` 匹配数字串
`\D`	非数字	`\D+` 匹配非数字串
`\w`	单词字符（str模式下默认包含中文等Unicode字母、数字和下划线，加re.ASCII时等价于 [a-zA-Z0-9_]）	`\w+` 匹配单词
`\W`	非单词字符
`\s`	空白字符	`\s+` 匹配空白
`\S`	非空白字符
`\b`	单词边界	`\bword\b` 匹配完整单词

4.3 re模块常用函数

import re

# Sample text used by every example below. It must contain a real address
# ("test@example.com") for the group examples to find anything.
text = "Hello, my email is test@example.com and phone is 13812345678"

# match: only matches at the very start of the string
result = re.match(r"Hello", text)
if result:
    print(result.group())  # "Hello"

# search: first match anywhere in the string
result = re.search(r"\d+", text)
if result:
    print(result.group())  # "13812345678"

# findall: list of all non-overlapping matches
numbers = re.findall(r"\d+", text)
print(numbers)  # ['13812345678']

# finditer: iterator of match objects (gives access to positions)
for match in re.finditer(r"\d+", text):
    print(f"找到：{match.group()}，位置：{match.span()}")

# sub: replace every match
new_text = re.sub(r"\d+", "***", text)
print(new_text)  # "Hello, my email is test@example.com and phone is ***"

# split: split on a pattern
parts = re.split(r"\s+", "hello   world  python")
print(parts)  # ['hello', 'world', 'python']

# compile: build the pattern object once and reuse it
pattern = re.compile(r"\d+")
result = pattern.findall(text)

# Groups: parentheses capture sub-matches
email_pattern = r"(\w+)@(\w+)\.(\w+)"
match = re.search(email_pattern, text)
if match:
    print(match.group())   # "test@example.com"
    print(match.group(1))  # "test"
    print(match.group(2))  # "example"
    print(match.group(3))  # "com"
    print(match.groups())  # ('test', 'example', 'com')

# Named groups: (?P<name>...) lets you fetch sub-matches by name
email_pattern = r"(?P<user>\w+)@(?P<domain>\w+)\.(?P<suffix>\w+)"
match = re.search(email_pattern, text)
if match:
    print(match.group("user"))    # "test"
    print(match.group("domain"))  # "example"
    print(match.groupdict())      # {'user': 'test', 'domain': 'example', 'suffix': 'com'}

# Flags
# re.IGNORECASE (re.I): case-insensitive matching
# re.MULTILINE (re.M): ^ and $ also match at line boundaries
# re.DOTALL (re.S): . also matches a newline
# re.VERBOSE (re.X): whitespace and # comments are allowed inside the pattern

pattern = re.compile(r"""
    \d{3}    # 区号
    [-\s]?   # 可选的分隔符
    \d{4}    # 前四位
    [-\s]?   # 可选的分隔符
    \d{4}    # 后四位
""", re.VERBOSE)

4.4 实际应用示例

import re

# 1. 验证邮箱
def is_valid_email(email):
    """Return True when *email* has the basic ``local@domain.suffix`` shape.

    This is a loose syntactic check: word characters, dots and hyphens on
    both sides of exactly one ``@``, followed by a final dot-separated
    suffix. It is not a complete RFC 5322 validator.

    Args:
        email: The string to check.

    Returns:
        bool: True if the whole string matches the pattern.
    """
    # fullmatch rather than match + "$": "$" also matches just before a
    # trailing newline, which would wrongly accept "user@example.com\n".
    pattern = r'[\w.-]+@[\w.-]+\.\w+'
    return re.fullmatch(pattern, email) is not None

print(is_valid_email("test@example.com"))  # True
print(is_valid_email("invalid-email"))     # False

# 2. 验证手机号（中国大陆）
def is_valid_phone(phone):
    """Return True when *phone* is an 11-digit mainland-China mobile number.

    The number must start with ``1``, have a second digit in ``3``-``9``,
    and be followed by nine more ASCII digits, with nothing else in the
    string.

    Args:
        phone: The string to check.

    Returns:
        bool: True if the whole string matches.
    """
    # [0-9] instead of \d: in str patterns \d also accepts non-ASCII digits
    # (e.g. full-width "１"). fullmatch instead of "$" so that a trailing
    # newline is rejected.
    return re.fullmatch(r'1[3-9][0-9]{9}', phone) is not None

print(is_valid_phone("13812345678"))  # True
print(is_valid_phone("12345678901"))  # False

# 3. 提取URL
def extract_urls(text):
    """Return every http/https URL found in *text*, in order of appearance.

    A URL here is ``http://`` or ``https://`` followed by a host made of
    ASCII letters, digits, ``_``, ``.`` and ``-``, then any number of
    ``/segment`` parts drawn from the same characters. Query strings and
    fragments are not captured.

    Args:
        text: The text to scan.

    Returns:
        list[str]: The matched URLs (empty list when none are found).
    """
    pattern = r'https?://[\w\.-]+(?:/[\w\.-]*)*'
    # re.ASCII: without it \w also matches CJK characters, so a URL written
    # directly next to Chinese text (no space) would swallow that text.
    # NOTE(review): this also excludes non-ASCII (IDN) host names.
    return re.findall(pattern, text, re.ASCII)

text = "访问 https://www.example.com 或 http://test.org/page"
print(extract_urls(text))

# 4. 提取IP地址
def extract_ips(text):
    """Return every dotted-quad (IPv4-looking) token found in *text*.

    Each of the four parts is 1-3 ASCII digits. The numeric range of each
    part is not checked, so ``999.1.1.1`` is returned as well.

    Args:
        text: The text to scan.

    Returns:
        list[str]: The matched addresses (empty list when none are found).
    """
    pattern = r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b'
    # re.ASCII is required for mixed Chinese/ASCII text: by default CJK
    # characters count as word characters, so there is no \b between "自"
    # and "1" and an address glued to Chinese text is never matched.
    return re.findall(pattern, text, re.ASCII)

log = "来自192.168.1.1的请求，转发到10.0.0.1"
print(extract_ips(log))  # ['192.168.1.1', '10.0.0.1']

# 5. 解析日志时间戳
def parse_log_timestamp(log_line):
    """Return the first ``[YYYY-MM-DD HH:MM:SS]`` timestamp in *log_line*.

    Args:
        log_line: One line of log text.

    Returns:
        The timestamp text without the surrounding brackets, or ``None``
        when the line contains no such timestamp.
    """
    found = re.search(r'\[(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\]', log_line)
    if found is None:
        return None
    return found[1]

log = "[2024-12-18 14:30:00] INFO: Application started"
print(parse_log_timestamp(log))  # "2024-12-18 14:30:00"

5. 字符串编码

# Python 3字符串默认是Unicode
s = "你好，世界"
print(type(s))  # <class 'str'>

# 编码：str -> bytes
b = s.encode('utf-8')
print(b)        # b'\xe4\xbd\xa0\xe5\xa5\xbd...'
print(type(b))  # <class 'bytes'>

# 解码：bytes -> str
s2 = b.decode('utf-8')
print(s2)       # "你好，世界"

# 不同编码
s = "你好"
print(s.encode('utf-8'))    # b'\xe4\xbd\xa0\xe5\xa5\xbd'（3字节/字）
print(s.encode('gbk'))      # b'\xc4\xe3\xba\xc3'（2字节/字）
print(s.encode('utf-16'))   # b'\xff\xfe`O}Y'

# 处理编码错误
s = "Hello, 世界"
# 严格模式（默认）
try:
    b = s.encode('ascii')
except UnicodeEncodeError as e:
    print(f"编码错误：{e}")

# 忽略无法编码的字符
b = s.encode('ascii', errors='ignore')
print(b)  # b'Hello, '

# 替换无法编码的字符
b = s.encode('ascii', errors='replace')
print(b)  # b'Hello, ??'

# 读取文件时指定编码
with open('file.txt', 'r', encoding='utf-8') as f:
    content = f.read()

6. 实际应用场景

6.1 日志解析

import re
from datetime import datetime

def parse_log_line(line):
    """Parse one log line into its timestamp, level and message.

    Expected format: ``[2024-12-18 14:30:00] [INFO] Message here``.

    Args:
        line: A single log line. The bracketed timestamp must be at the
            very start of the line.

    Returns:
        A dict with the keys ``'timestamp'`` (a ``datetime``), ``'level'``
        and ``'message'``, or ``None`` when the line does not follow the
        format or its timestamp is not a real date/time.
    """
    pattern = r'\[(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\] \[(\w+)\] (.+)'
    match = re.match(pattern, line)
    if match is None:
        return None

    raw_timestamp, level, message = match.groups()
    try:
        timestamp = datetime.strptime(raw_timestamp, '%Y-%m-%d %H:%M:%S')
    except ValueError:
        # The digits fit the pattern but are not a valid date/time
        # (e.g. month 13). Treat this like any other malformed line
        # instead of letting the exception escape.
        return None
    return {'timestamp': timestamp, 'level': level, 'message': message}

# 示例
log_lines = [
    "[2024-12-18 14:30:00] [INFO] Application started",
    "[2024-12-18 14:30:01] [ERROR] Connection failed",
    "[2024-12-18 14:30:02] [WARNING] Low memory",
]

for line in log_lines:
    result = parse_log_line(line)
    if result:
        print(f"{result['level']}: {result['message']}")

6.2 文本提取

import re

def extract_data_from_report(report):
    """Extract the date, amount and growth rate from report text.

    Recognised fields (the label may be followed by ``：`` or ``:``):

    * ``日期`` - a date written as YYYY-MM-DD or YYYY/MM/DD
    * ``金额`` - a number with optional thousands separators, optionally
      preceded by ``￥``, ``¥`` or ``$``
    * ``增长率`` - a percentage, which may be negative

    Args:
        report: The report text to scan.

    Returns:
        dict: May contain ``'date'`` (str), ``'amount'`` (float) and
        ``'growth_rate'`` (float). A field that is absent or not a
        well-formed number is left out.
    """
    data = {}

    # Date
    date_match = re.search(r'日期[：:]\s*(\d{4}[-/]\d{2}[-/]\d{2})', report)
    if date_match:
        data['date'] = date_match.group(1)

    # Amount: the capture must start with a digit, so input like "金额：,"
    # cannot reach float() as an empty string.
    amount_match = re.search(r'金额[：:]\s*[￥¥$]?(\d[\d,]*\.?\d*)', report)
    if amount_match:
        data['amount'] = float(amount_match.group(1).replace(',', ''))

    # Percentage: exactly one optional decimal point (so "1.2.3%" is not
    # captured) and an optional leading minus sign for negative growth.
    percent_match = re.search(r'增长率[：:]\s*(-?(?:\d+\.?\d*|\.\d+))%', report)
    if percent_match:
        data['growth_rate'] = float(percent_match.group(1))

    return data

report = """
月度销售报告
日期：2024-12-18
总金额：￥1,234,567.89
同比增长率：15.5%
"""

print(extract_data_from_report(report))
# {'date': '2024-12-18', 'amount': 1234567.89, 'growth_rate': 15.5}

6.3 数据清洗

import re

def clean_text(text):
    """Strip symbols from *text* and normalise its whitespace.

    Every character that is not a word character, whitespace or a CJK
    ideograph in U+4E00-U+9FFF is removed. Runs of whitespace are then
    collapsed to a single space, and leading/trailing whitespace is
    stripped.

    Args:
        text: The raw text.

    Returns:
        str: The cleaned text.
    """
    # Remove special characters first. Doing it after collapsing
    # whitespace would leave double spaces wherever a symbol stood
    # between two spaces ("World! @#$ 你好" -> "World  你好").
    text = re.sub(r'[^\w\s\u4e00-\u9fff]', '', text)

    # Collapse the remaining whitespace runs to single spaces.
    text = re.sub(r'\s+', ' ', text)

    # Trim both ends.
    return text.strip()

def normalize_phone(phone):
    """Reduce *phone* to its digits and format 11-digit numbers as 3-4-4.

    Args:
        phone: A phone number in any punctuation/spacing style.

    Returns:
        ``"XXX-XXXX-XXXX"`` when exactly 11 digits remain after removing
        every non-digit character; otherwise the bare digit string.
    """
    only_digits = re.sub(r'\D', '', phone)
    if len(only_digits) != 11:
        return only_digits
    head, middle, tail = only_digits[:3], only_digits[3:7], only_digits[7:]
    return '-'.join((head, middle, tail))

def extract_chinese(text):
    """Return only the characters of *text* in the range U+4E00-U+9FFF.

    The characters keep their original order and are concatenated with
    nothing in between.
    """
    runs = (hit.group() for hit in re.finditer(r'[\u4e00-\u9fff]+', text))
    return ''.join(runs)

# 测试
print(clean_text("  Hello   World!  @#$  你好  "))
print(normalize_phone("138-1234-5678"))
print(normalize_phone("(138) 1234 5678"))
print(extract_chinese("Hello你好World世界"))

7. 常见错误与避坑

❌ 错误1：忘记使用原始字符串

import re

# Wrong: in a normal (non-raw) string literal Python processes backslash
# escapes before the regex engine ever sees the text
# pattern = "\d+"  # "\d" is not a valid string escape, so the backslash is kept, but Python
#                  # warns about it (SyntaxWarning since 3.12); a valid escape such as
#                  # "\b" would silently turn into a backspace character instead

# Right: use a raw string so every backslash reaches the regex engine unchanged
pattern = r"\d+"

❌ 错误2：字符串不可变

s = "Hello"

# 错误：尝试修改字符串
# s[0] = 'h'  # TypeError

# 正确：创建新字符串
s = 'h' + s[1:]

❌ 错误3：编码问题

# 错误：不指定编码读取文件
# with open('file.txt') as f:  # 可能乱码
#     content = f.read()

# 正确：指定编码
with open('file.txt', encoding='utf-8') as f:
    content = f.read()

❌ 错误4：低效的字符串拼接

# 错误：循环中使用+拼接（低效）
result = ""
for i in range(1000):
    result += str(i)

# 正确：使用join（高效）
result = "".join(str(i) for i in range(1000))

# 或使用列表
parts = []
for i in range(1000):
    parts.append(str(i))
result = "".join(parts)

8. 实战练习

练习1：解析Vivado日志

"""
练习：解析Vivado时序报告，提取关键信息
"""
import re

def parse_timing_report(report):
    """Extract WNS, TNS and the clock frequency from timing-report text.

    ``WNS`` and ``TNS`` are read from lines of the form
    ``WNS: 0.123 ns`` (the separator may be ``:``, ``：`` or ``=``).
    The frequency is the first number that is followed by ``MHz``.

    Args:
        report: The report text to scan.

    Returns:
        dict: May contain ``'wns'``, ``'tns'`` and ``'frequency'`` as
        floats. A value that is absent or not a well-formed number is
        left out.
    """
    # One optional minus sign and at most one decimal point. A loose
    # character class such as [-\d.]+ would also capture "-" or "1.2.3"
    # and make float() raise.
    number = r'-?(?:\d+\.?\d*|\.\d+)'
    results = {}

    # WNS (Worst Negative Slack) and TNS (Total Negative Slack) share the
    # same line layout, so one loop handles both.
    for key, label in (('wns', 'WNS'), ('tns', 'TNS')):
        slack_match = re.search(rf'{label}\s*[：:=]\s*({number})\s*ns', report)
        if slack_match:
            results[key] = float(slack_match.group(1))

    # Clock frequency
    freq_match = re.search(r'(\d+\.?\d*)\s*MHz', report)
    if freq_match:
        results['frequency'] = float(freq_match.group(1))

    return results

# 测试
report = """
Timing Summary
WNS: 0.123 ns
TNS: 0.000 ns
Clock: clk_100 @ 100.0 MHz
"""
print(parse_timing_report(report))

练习2：批量重命名

"""
练习：批量重命名文件名中的日期格式
"""
import re

def rename_date_format(filename):
    """Rewrite MM-DD-YYYY dates inside *filename* as YYYY-MM-DD.

    Every ``dd-dd-dddd`` group that is not part of a longer run of digits
    is rewritten. The month and day values are not range-checked.

    Args:
        filename: The file name to convert.

    Returns:
        str: The file name with its dates reordered. It is returned
        unchanged when it contains no such date.
    """
    # The look-arounds stop the pattern from matching in the middle of a
    # longer number (e.g. "112-18-20245"), which would otherwise be cut
    # apart and reordered.
    pattern = r'(?<!\d)(\d{2})-(\d{2})-(\d{4})(?!\d)'
    # Groups are (month, day, year); emit them as year-month-day.
    return re.sub(pattern, r'\3-\1-\2', filename)

# 测试
filenames = [
    "report_12-18-2024.xlsx",
    "data_01-05-2024_backup.csv",
    "log_11-30-2024.txt"
]

for name in filenames:
    print(f"{name} -> {rename_date_format(name)}")

9. 总结

🔑 核心要点

知识点	要点
f-string	最推荐的格式化方式，支持表达式和格式说明符
字符串方法	find/replace/split/join/strip等
正则表达式	强大的模式匹配工具，re模块
编码	Python 3默认Unicode，注意文件编码
性能	大量拼接用join，避免循环中用+

✅ 学习检查清单

掌握f-string格式化
熟练使用字符串常用方法
理解正则表达式基本语法
能使用re模块进行文本匹配和提取
理解字符串编码问题

📖 下一步学习

掌握了字符串处理后，让我们继续学习Python的文件操作（见下一篇：10 - Python文件操作）。

常见问题 FAQ

💬 正则表达式字符串前面为什么要加r？

r""是原始字符串，不对\做转义。正则表达式充满反斜杠：不加r时，\b这类合法的字符串转义会先被Python解释成退格符，正则引擎就收不到“单词边界”；\d这类不是合法字符串转义的写法虽然会被原样保留，但会触发警告（Python 3.12起为SyntaxWarning）。养成习惯：正则表达式一律用r""。

💬 字符串拼接用+还是join？

少量拼接用+或f-string。大量拼接（如循环中）必须用"".join(list)，因为+每次都创建新字符串，性能差距可达100倍。

📚 系列导航

上一篇：08 - Python函数基础

当前：09 - Python字符串处理

下一篇：10 - Python文件操作