Veloris.
返回索引
概念基础 2026-02-14

Python字符串处理:split、join、replace、正则表达式,文本处理的瑞士军刀

3 分钟
961 words

Python字符串处理:split、join、replace、正则表达式,文本处理的瑞士军刀

字符串处理是日常编程中最常见的任务之一。Python提供了丰富的字符串操作方法,从基本的拼接、切片到强大的正则表达式。本篇将详细介绍字符串的各种操作方法,特别关注在日志分析、报表文本提取等实际场景中的应用。


1. 字符串基础回顾

# 字符串创建
s1 = 'Hello'
s2 = "Hello"
s3 = '''多行
字符串'''
s4 = r"原始字符串\n不转义"

# 字符串是不可变的
s = "Hello"
# s[0] = 'h'  # TypeError

# 索引和切片
s = "Hello, World!"
print(s[0])      # 'H'
print(s[-1])     # '!'
print(s[0:5])    # 'Hello'
print(s[7:])     # 'World!'
print(s[::-1])   # '!dlroW ,olleH'

# 字符串拼接
s1 = "Hello"
s2 = "World"
s3 = s1 + " " + s2  # "Hello World"
s4 = " ".join([s1, s2])  # "Hello World"(推荐)

# 字符串重复
s = "Ha" * 3  # "HaHaHa"

# 长度
print(len("Hello"))  # 5

# 成员检测
print("ell" in "Hello")  # True

2. 字符串格式化

2.1 f-string(推荐)

Python 3.6+引入的f-string是最简洁、最强大的格式化方式。

name = "张三"
age = 25
score = 95.5

# 基本用法
print(f"姓名:{name},年龄:{age}")

# 表达式
print(f"明年{age + 1}岁")
print(f"姓名长度:{len(name)}")

# 调用方法
print(f"大写:{name.upper()}")

# 格式说明符
# 数字格式化
print(f"分数:{score:.2f}")        # 95.50(保留2位小数)
print(f"分数:{score:>10.2f}")     # '     95.50'(右对齐,宽度10)
print(f"分数:{score:<10.2f}")     # '95.50     '(左对齐)
print(f"分数:{score:^10.2f}")     # '  95.50   '(居中)
print(f"分数:{score:010.2f}")     # '0000095.50'(零填充)

# 整数格式化
num = 255
print(f"十进制:{num:d}")          # 255
print(f"二进制:{num:b}")          # 11111111
print(f"八进制:{num:o}")          # 377
print(f"十六进制:{num:x}")        # ff
print(f"十六进制:{num:X}")        # FF
print(f"带前缀:{num:#x}")         # 0xff
print(f"带前缀:{num:#b}")         # 0b11111111

# 千位分隔符
big_num = 1234567890
print(f"金额:{big_num:,}")        # 1,234,567,890
print(f"金额:{big_num:_}")        # 1_234_567_890

# 百分比
ratio = 0.756
print(f"比例:{ratio:.1%}")        # 75.6%

# 对齐和填充
text = "Hi"
print(f"{text:>10}")    # '        Hi'
print(f"{text:<10}")    # 'Hi        '
print(f"{text:^10}")    # '    Hi    '
print(f"{text:*^10}")   # '****Hi****'
print(f"{text:-<10}")   # 'Hi--------'

# 日期格式化
from datetime import datetime
now = datetime.now()
print(f"日期:{now:%Y-%m-%d}")           # 2024-12-18
print(f"时间:{now:%H:%M:%S}")           # 14:30:00
print(f"完整:{now:%Y-%m-%d %H:%M:%S}")  # 2024-12-18 14:30:00

# 调试模式(Python 3.8+)
x = 10
y = 20
print(f"{x=}, {y=}")        # x=10, y=20
print(f"{x + y=}")          # x + y=30

2.2 format方法

# 位置参数
print("{}{}岁".format("张三", 25))
print("{0}{1}岁,{0}是学生".format("张三", 25))

# 关键字参数
print("{name}{age}岁".format(name="张三", age=25))

# 混合使用
print("{0}{age}岁".format("张三", age=25))

# 格式说明符
print("{:.2f}".format(3.14159))  # 3.14
print("{:>10}".format("Hi"))     # '        Hi'

# 从字典格式化
person = {"name": "张三", "age": 25}
print("{name}{age}岁".format(**person))

# 访问属性和索引
point = (3, 4)
print("x={0[0]}, y={0[1]}".format(point))

2.3 %格式化(旧式)

# 类似C语言的printf
name = "张三"
age = 25
score = 95.5

print("姓名:%s" % name)
print("年龄:%d" % age)
print("分数:%.2f" % score)
print("姓名:%s,年龄:%d" % (name, age))

# 格式说明符
print("%10s" % "Hi")      # '        Hi'(右对齐)
print("%-10s" % "Hi")     # 'Hi        '(左对齐)
print("%010d" % 42)       # '0000000042'(零填充)
print("%+d" % 42)         # '+42'(显示正号)
print("%x" % 255)         # 'ff'(十六进制)

3. 字符串常用方法

3.1 查找与替换

s = "Hello, World! Hello, Python!"

# 查找
print(s.find("Hello"))       # 0(第一次出现的位置)
print(s.find("Hello", 5))    # 14(从位置5开始找)
print(s.find("Java"))        # -1(找不到返回-1)
print(s.rfind("Hello"))      # 14(从右边开始找)

print(s.index("Hello"))      # 0(找不到会抛出ValueError)
print(s.rindex("Hello"))     # 14

print(s.count("Hello"))      # 2(出现次数)

# 替换
print(s.replace("Hello", "Hi"))           # 替换所有
print(s.replace("Hello", "Hi", 1))        # 只替换第一个

# 检查开头和结尾
print(s.startswith("Hello"))  # True
print(s.endswith("!"))        # True
print(s.startswith(("Hello", "Hi")))  # True(多个前缀)

3.2 分割与连接

# 分割
s = "apple,banana,cherry"
print(s.split(","))          # ['apple', 'banana', 'cherry']
print(s.split(",", 1))       # ['apple', 'banana,cherry'](最多分割1次)

s = "  hello   world  "
print(s.split())             # ['hello', 'world'](按空白分割)

s = "line1\nline2\nline3"
print(s.splitlines())        # ['line1', 'line2', 'line3']

# 从右边分割
s = "a.b.c.d"
print(s.rsplit(".", 1))      # ['a.b.c', 'd']

# 分割并保留分隔符
import re
s = "one1two2three3four"
print(re.split(r'(\d)', s))  # ['one', '1', 'two', '2', 'three', '3', 'four']

# 连接
words = ["apple", "banana", "cherry"]
print(",".join(words))       # "apple,banana,cherry"
print(" ".join(words))       # "apple banana cherry"
print("\n".join(words))      # 多行

# 注意:join只能连接字符串列表
numbers = [1, 2, 3]
# print(",".join(numbers))   # TypeError
print(",".join(map(str, numbers)))  # "1,2,3"

3.3 大小写转换

s = "Hello, World!"

print(s.upper())       # "HELLO, WORLD!"
print(s.lower())       # "hello, world!"
print(s.capitalize())  # "Hello, world!"(首字母大写)
print(s.title())       # "Hello, World!"(每个单词首字母大写)
print(s.swapcase())    # "hELLO, wORLD!"(大小写互换)

# 大小写不敏感比较
s1 = "Hello"
s2 = "hello"
print(s1.lower() == s2.lower())  # True
print(s1.casefold() == s2.casefold())  # True(更强的大小写折叠)

3.4 去除空白与填充

s = "  Hello, World!  "

# 去除空白
print(s.strip())       # "Hello, World!"(两端)
print(s.lstrip())      # "Hello, World!  "(左端)
print(s.rstrip())      # "  Hello, World!"(右端)

# 去除指定字符
s = "###Hello###"
print(s.strip("#"))    # "Hello"

# 填充
s = "Hi"
print(s.center(10))         # "    Hi    "
print(s.center(10, "*"))    # "****Hi****"
print(s.ljust(10))          # "Hi        "
print(s.rjust(10))          # "        Hi"
print(s.zfill(5))           # "000Hi"

# 数字填充
num = "42"
print(num.zfill(5))         # "00042"
print("-42".zfill(5))       # "-0042"(符号在前)

3.5 判断方法

# 内容判断
print("123".isdigit())      # True(全是数字)
print("abc".isalpha())      # True(全是字母)
print("abc123".isalnum())   # True(全是字母或数字)
print("   ".isspace())      # True(全是空白)
print("Hello".istitle())    # True(标题格式)
print("HELLO".isupper())    # True(全大写)
print("hello".islower())    # True(全小写)

# 数字判断的区别
print("123".isdigit())      # True
print("123".isdecimal())    # True
print("123".isnumeric())    # True
print("½".isdigit())        # False
print("½".isnumeric())      # True(包含更多数字字符)

# 标识符判断
print("my_var".isidentifier())   # True(合法的变量名)
print("2var".isidentifier())     # False
print("import".isidentifier())   # True(但是关键字)

import keyword
print(keyword.iskeyword("import"))  # True

4. 正则表达式入门

4.1 什么是正则表达式

正则表达式(Regular Expression,简称regex)是一种强大的文本匹配工具,用于描述字符串的模式。

4.2 基本语法

元字符说明示例
.匹配任意字符(除换行)a.c 匹配 “abc”, “a1c”
^匹配字符串开头^Hello
$匹配字符串结尾World$
*匹配0次或多次ab* 匹配 “a”, “ab”, “abb”
+匹配1次或多次ab+ 匹配 “ab”, “abb”
?匹配0次或1次ab? 匹配 “a”, “ab”
{n}匹配n次a{3} 匹配 “aaa”
{n,m}匹配n到m次a{2,4} 匹配 “aa”, “aaa”, “aaaa”
[]字符集[abc] 匹配 “a”, “b”, “c”
[^]否定字符集[^abc] 匹配非a/b/c的字符
|或(匹配左右任一表达式)a|b 匹配 “a” 或 “b”
()分组(ab)+ 匹配 “ab”, “abab”
\d数字 [0-9]\d+ 匹配数字串
\D非数字\D+ 匹配非数字串
\w单词字符(ASCII模式下等价于 [a-zA-Z0-9_];Python 3 的 str 模式默认还匹配中文等Unicode字母和数字)\w+ 匹配单词
\W非单词字符
\s空白字符\s+ 匹配空白
\S非空白字符
\b单词边界\bword\b 匹配完整单词

4.3 re模块常用函数

import re

# Sample input. The address must be a real "user@domain.suffix" string,
# otherwise the grouping examples further down never match.
text = "Hello, my email is test@example.com and phone is 13812345678"

# match: only matches at the very start of the string
result = re.match(r"Hello", text)
if result:
    print(result.group())  # "Hello"

# search: first match anywhere in the string
result = re.search(r"\d+", text)
if result:
    print(result.group())  # "13812345678"

# findall: all non-overlapping matches as a list of strings
numbers = re.findall(r"\d+", text)
print(numbers)  # ['13812345678']

# finditer: iterator of match objects (gives access to positions)
for match in re.finditer(r"\d+", text):
    print(f"找到:{match.group()},位置:{match.span()}")

# sub: replace every match
new_text = re.sub(r"\d+", "***", text)
print(new_text)  # "Hello, my email is test@example.com and phone is ***"

# split: split on a pattern
parts = re.split(r"\s+", "hello   world  python")
print(parts)  # ['hello', 'world', 'python']

# Pre-compile a pattern that is reused many times
pattern = re.compile(r"\d+")
result = pattern.findall(text)

# Groups
email_pattern = r"(\w+)@(\w+)\.(\w+)"
match = re.search(email_pattern, text)
if match:
    print(match.group())   # "test@example.com"
    print(match.group(1))  # "test"
    print(match.group(2))  # "example"
    print(match.group(3))  # "com"
    print(match.groups())  # ('test', 'example', 'com')

# Named groups
email_pattern = r"(?P<user>\w+)@(?P<domain>\w+)\.(?P<suffix>\w+)"
match = re.search(email_pattern, text)
if match:
    print(match.group("user"))    # "test"
    print(match.group("domain"))  # "example"
    print(match.groupdict())      # {'user': 'test', 'domain': 'example', 'suffix': 'com'}

# Flags
# re.IGNORECASE (re.I): case-insensitive matching
# re.MULTILINE (re.M): ^ and $ also match at line boundaries
# re.DOTALL (re.S): . also matches newline
# re.VERBOSE (re.X): whitespace and # comments allowed inside the pattern

pattern = re.compile(r"""
    \d{3}    # area code
    [-\s]?   # optional separator
    \d{4}    # first four digits
    [-\s]?   # optional separator
    \d{4}    # last four digits
""", re.VERBOSE)

4.4 实际应用示例

import re

# 1. 验证邮箱
def is_valid_email(email):
    """Return True if ``email`` looks like ``local@domain.suffix``.

    This is a loose shape check, not full RFC validation: the local part
    and domain may contain word characters, dots and hyphens, and the
    suffix must be one or more word characters.

    ``re.fullmatch`` is used instead of ``^...$`` with ``re.match`` because
    ``$`` also matches just before a trailing newline, which would let
    ``"a@b.com\\n"`` through.
    """
    pattern = r'[\w\.-]+@[\w\.-]+\.\w+'
    return bool(re.fullmatch(pattern, email))

print(is_valid_email("test@example.com"))  # True
print(is_valid_email("invalid-email"))     # False

# 2. 验证手机号(中国大陆)
def is_valid_phone(phone):
    """Return True if ``phone`` is an 11-digit mainland-China mobile number.

    The number must start with ``1``, have ``3``-``9`` as its second digit
    and be followed by nine more digits.

    ``[0-9]`` is used rather than ``\\d`` so that non-ASCII decimal digits
    (for example full-width digits) are rejected, and ``re.fullmatch`` is
    used so that a trailing newline is not silently accepted.
    """
    pattern = r'1[3-9][0-9]{9}'
    return bool(re.fullmatch(pattern, phone))

print(is_valid_phone("13812345678"))  # True
print(is_valid_phone("12345678901"))  # False

# 3. 提取URL
def extract_urls(text):
    """Return all http/https URLs found in ``text``, in order of appearance.

    A URL here is a scheme, a host made of word characters, dots and
    hyphens, and any number of ``/segment`` path parts built from the same
    character set.
    """
    url_regex = re.compile(r'https?://[\w\.-]+(?:/[\w\.-]*)*')
    # The pattern has no capturing groups, so each hit's group(0) is the URL.
    return [hit.group(0) for hit in url_regex.finditer(text)]

text = "访问 https://www.example.com 或 http://test.org/page"
print(extract_urls(text))

# 4. 提取IP地址
def extract_ips(text):
    """Return all dotted-quad IPv4-looking substrings found in ``text``.

    Each result is four groups of 1-3 digits separated by dots. The
    octets are not range-checked, so ``999.1.1.1`` is returned too.

    The pattern is matched with ``re.ASCII``. Without it ``\\b`` is
    Unicode-aware and treats CJK characters as word characters, so an
    address directly adjacent to Chinese text has no word boundary and is
    never found.
    """
    pattern = r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b'
    return re.findall(pattern, text, flags=re.ASCII)

log = "来自192.168.1.1的请求,转发到10.0.0.1"
print(extract_ips(log))  # ['192.168.1.1', '10.0.0.1']

# 5. 解析日志时间戳
def parse_log_timestamp(log_line):
    """Return the first ``[YYYY-MM-DD HH:MM:SS]`` timestamp in ``log_line``.

    The surrounding square brackets are stripped from the returned string.
    Returns None when the line contains no such timestamp.
    """
    found = re.search(r'\[(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\]', log_line)
    if found is None:
        return None
    return found.group(1)

log = "[2024-12-18 14:30:00] INFO: Application started"
print(parse_log_timestamp(log))  # "2024-12-18 14:30:00"

5. 字符串编码

# Python 3字符串默认是Unicode
s = "你好,世界"
print(type(s))  # <class 'str'>

# 编码:str -> bytes
b = s.encode('utf-8')
print(b)        # b'\xe4\xbd\xa0\xe5\xa5\xbd...'
print(type(b))  # <class 'bytes'>

# 解码:bytes -> str
s2 = b.decode('utf-8')
print(s2)       # "你好,世界"

# 不同编码
s = "你好"
print(s.encode('utf-8'))    # b'\xe4\xbd\xa0\xe5\xa5\xbd'(3字节/字)
print(s.encode('gbk'))      # b'\xc4\xe3\xba\xc3'(2字节/字)
print(s.encode('utf-16'))   # b'\xff\xfe`O}Y'

# 处理编码错误
s = "Hello, 世界"
# 严格模式(默认)
try:
    b = s.encode('ascii')
except UnicodeEncodeError as e:
    print(f"编码错误:{e}")

# 忽略无法编码的字符
b = s.encode('ascii', errors='ignore')
print(b)  # b'Hello, '

# 替换无法编码的字符
b = s.encode('ascii', errors='replace')
print(b)  # b'Hello, ??'

# 读取文件时指定编码
with open('file.txt', 'r', encoding='utf-8') as f:
    content = f.read()

6. 实际应用场景

6.1 日志解析

import re
from datetime import datetime

def parse_log_line(line):
    """Parse one log line into its timestamp, level and message.

    Expected format: ``[2024-12-18 14:30:00] [INFO] Message here``

    Returns:
        A dict with keys ``'timestamp'`` (a ``datetime``), ``'level'`` and
        ``'message'`` (both ``str``), or None if the line does not follow
        the format or its timestamp is not a real date/time.
    """
    pattern = r'\[(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\] \[(\w+)\] (.+)'
    match = re.match(pattern, line)
    if match is None:
        return None

    raw_timestamp, level, message = match.groups()
    try:
        timestamp = datetime.strptime(raw_timestamp, '%Y-%m-%d %H:%M:%S')
    except ValueError:
        # The regex only checks the digit layout; something like month 13
        # passes it but is rejected here. Treat it as an unparseable line
        # so callers deal with a single failure signal (None).
        return None
    return {'timestamp': timestamp, 'level': level, 'message': message}

# 示例
log_lines = [
    "[2024-12-18 14:30:00] [INFO] Application started",
    "[2024-12-18 14:30:01] [ERROR] Connection failed",
    "[2024-12-18 14:30:02] [WARNING] Low memory",
]

for line in log_lines:
    result = parse_log_line(line)
    if result:
        print(f"{result['level']}: {result['message']}")

6.2 文本提取

import re

def extract_data_from_report(report):
    """Extract the date, amount and growth rate from a report's text.

    Looks for the labels ``日期``, ``金额`` and ``增长率`` followed by a
    colon. Both the ASCII colon and the full-width colon (U+FF1A) are
    accepted, as are the half-width (U+00A5) and full-width (U+FFE5) yen
    signs or ``$`` before the amount.

    Returns:
        A dict holding whichever of these keys were found:
        ``'date'`` (str, as written in the report), ``'amount'`` (float,
        thousands separators removed) and ``'growth_rate'`` (float, the
        number in front of ``%``).
    """
    data = {}

    # Date: YYYY-MM-DD or YYYY/MM/DD
    date_match = re.search(r'日期[:\uff1a]\s*(\d{4}[-/]\d{2}[-/]\d{2})', report)
    if date_match:
        data['date'] = date_match.group(1)

    # Amount: optional currency sign, digits with optional commas/decimals
    amount_match = re.search(
        r'金额[:\uff1a]\s*[\u00a5\uffe5$]?([\d,]+\.?\d*)', report
    )
    if amount_match:
        data['amount'] = float(amount_match.group(1).replace(',', ''))

    # Growth rate: number immediately followed by a percent sign
    percent_match = re.search(r'增长率[:\uff1a]\s*([\d.]+)%', report)
    if percent_match:
        data['growth_rate'] = float(percent_match.group(1))

    return data

report = """
月度销售报告
日期:2024-12-18
总金额:¥1,234,567.89
同比增长率:15.5%
"""

print(extract_data_from_report(report))
# {'date': '2024-12-18', 'amount': 1234567.89, 'growth_rate': 15.5}

6.3 数据清洗

import re

def clean_text(text):
    """Strip symbols from ``text`` and normalize its whitespace.

    Steps, in order:
      1. Remove every character that is not a word character, whitespace
         or a CJK ideograph in U+4E00-U+9FFF.
      2. Collapse each run of whitespace into a single space.
      3. Trim leading and trailing whitespace.

    Symbols are removed before whitespace is collapsed. Doing it the other
    way round leaves a double space wherever a symbol sat between two
    spaces (``"a ! b"`` would become ``"a  b"``).
    """
    # Remove special characters
    text = re.sub(r'[^\w\s\u4e00-\u9fff]', '', text)

    # Collapse redundant whitespace (including gaps left by step 1)
    text = re.sub(r'\s+', ' ', text)

    # Trim both ends
    return text.strip()

def normalize_phone(phone):
    """Normalize a phone number to ``XXX-XXXX-XXXX`` where possible.

    All non-digit characters are dropped first. If exactly 11 digits
    remain they are grouped 3-4-4 with hyphens; otherwise the bare digit
    string is returned unchanged.
    """
    digits = re.sub(r'\D', '', phone)

    # Anything that is not an 11-digit number is returned as plain digits.
    if len(digits) != 11:
        return digits

    prefix, middle, tail = digits[:3], digits[3:7], digits[7:]
    return f"{prefix}-{middle}-{tail}"

def extract_chinese(text):
    """Return only the CJK ideographs (U+4E00-U+9FFF) of ``text``.

    The characters keep their original order and are concatenated with
    nothing in between.
    """
    chinese_runs = re.findall(r'[\u4e00-\u9fff]+', text)
    return ''.join(chinese_runs)

# 测试
print(clean_text("  Hello   World!  @#$  你好  "))
print(normalize_phone("138-1234-5678"))
print(normalize_phone("(138) 1234 5678"))
print(extract_chinese("Hello你好World世界"))

7. 常见错误与避坑

❌ 错误1:忘记使用原始字符串

import re

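# 错误:普通字符串里的反斜杠会先按Python的转义规则处理
# pattern = "\d+"      # "\d" 不是合法转义:结果仍是 \d+,但会触发 SyntaxWarning(3.12 之前为 DeprecationWarning)
# pattern = "\bword\b" # 这里的 \b 会被解释为退格符(\x08),正则完全失效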

# 正确:使用原始字符串
pattern = r"\d+"

❌ 错误2:字符串不可变

s = "Hello"

# 错误:尝试修改字符串
# s[0] = 'h'  # TypeError

# 正确:创建新字符串
s = 'h' + s[1:]

❌ 错误3:编码问题

# 错误:不指定编码读取文件
# with open('file.txt') as f:  # 可能乱码
#     content = f.read()

# 正确:指定编码
with open('file.txt', encoding='utf-8') as f:
    content = f.read()

❌ 错误4:低效的字符串拼接

# 错误:循环中使用+拼接(低效)
result = ""
for i in range(1000):
    result += str(i)

# 正确:使用join(高效)
result = "".join(str(i) for i in range(1000))

# 或使用列表
parts = []
for i in range(1000):
    parts.append(str(i))
result = "".join(parts)

8. 实战练习

练习1:解析Vivado日志

"""
练习:解析Vivado时序报告,提取关键信息
"""
import re

def parse_timing_report(report):
    """Extract WNS, TNS and clock frequency from timing-report text.

    Returns:
        A dict holding whichever of these keys were found, each as a
        float: ``'wns'`` and ``'tns'`` (the number before ``ns`` after the
        ``WNS`` / ``TNS`` label) and ``'frequency'`` (the first number
        followed by ``MHz``).
    """
    # (result key, pattern whose group 1 is the numeric value), in the
    # order the keys are inserted into the result.
    field_patterns = (
        ('wns', r'WNS\s*[::=]\s*([-\d.]+)\s*ns'),
        ('tns', r'TNS\s*[::=]\s*([-\d.]+)\s*ns'),
        ('frequency', r'(\d+\.?\d*)\s*MHz'),
    )

    results = {}
    for key, regex in field_patterns:
        found = re.search(regex, report)
        if found is not None:
            results[key] = float(found.group(1))
    return results

# 测试
report = """
Timing Summary
WNS: 0.123 ns
TNS: 0.000 ns
Clock: clk_100 @ 100.0 MHz
"""
print(parse_timing_report(report))

练习2:批量重命名

"""
练习:批量重命名文件名中的日期格式
"""
import re

def rename_date_format(filename):
    """Rewrite ``MM-DD-YYYY`` dates in ``filename`` as ``YYYY-MM-DD``.

    Every such date in the name is converted; the rest of the name is
    left untouched. Month and day values are not range-checked.

    The digit lookarounds stop the pattern from matching inside a longer
    run of digits (for example a serial number like ``123-45-67890``),
    which would otherwise be mangled.
    """
    pattern = r'(?<!\d)(\d{2})-(\d{2})-(\d{4})(?!\d)'

    def replacer(match):
        month, day, year = match.groups()
        return f"{year}-{month}-{day}"

    return re.sub(pattern, replacer, filename)

# 测试
filenames = [
    "report_12-18-2024.xlsx",
    "data_01-05-2024_backup.csv",
    "log_11-30-2024.txt"
]

for name in filenames:
    print(f"{name} -> {rename_date_format(name)}")

9. 总结

🔑 核心要点

知识点要点
f-string最推荐的格式化方式,支持表达式和格式说明符
字符串方法find/replace/split/join/strip等
正则表达式强大的模式匹配工具,re模块
编码Python 3默认Unicode,注意文件编码
性能大量拼接用join,避免循环中用+

✅ 学习检查清单

  • 掌握f-string格式化
  • 熟练使用字符串常用方法
  • 理解正则表达式基本语法
  • 能使用re模块进行文本匹配和提取
  • 理解字符串编码问题

📖 下一步学习

掌握了字符串处理后,让我们学习Python的文件操作:


常见问题 FAQ

💬 正则表达式字符串前面为什么要加r?

r""是原始字符串,不对\做转义。正则表达式充满反斜杠:不加r时,\b 会被Python解释为退格符而不是单词边界,\d 这类非法转义虽然会原样保留,但会触发 SyntaxWarning(Python 3.12 之前为 DeprecationWarning)。养成习惯:正则表达式一律用r""

💬 字符串拼接用+还是join?

少量拼接用+或f-string。大量拼接(如循环中)应使用"".join(list):字符串不可变,+每次都可能创建新字符串,最坏情况下总耗时随长度呈平方级增长。CPython 对 += 有就地优化,但这属于实现细节,不应依赖。


系列导航

End of file.