Python字符串处理:split、join、replace、正则表达式,文本处理的瑞士军刀
字符串处理是日常编程中最常见的任务之一。Python提供了丰富的字符串操作方法,从基本的拼接、切片到强大的正则表达式。本篇将详细介绍字符串的各种操作方法,特别关注在日志分析、报表文本提取等实际场景中的应用。
1. 字符串基础回顾
# Creating strings: single, double, triple-quoted (multi-line) and raw literals
s1 = 'Hello'
s2 = "Hello"
s3 = '''多行
字符串'''
s4 = r"原始字符串\n不转义"
# Strings are immutable
s = "Hello"
# s[0] = 'h' # TypeError: item assignment is not supported
# Indexing and slicing
s = "Hello, World!"
print(s[0]) # 'H'
print(s[-1]) # '!'
print(s[0:5]) # 'Hello'
print(s[7:]) # 'World!'
print(s[::-1]) # '!dlroW ,olleH'
# Concatenation
s1 = "Hello"
s2 = "World"
s3 = s1 + " " + s2 # "Hello World"
s4 = " ".join([s1, s2]) # "Hello World" (preferred when joining many parts)
# Repetition
s = "Ha" * 3 # "HaHaHa"
# Length
print(len("Hello")) # 5
# Membership test
print("ell" in "Hello") # True
2. 字符串格式化
2.1 f-string(推荐)
Python 3.6+引入的f-string是最简洁、最强大的格式化方式。
name = "张三"
age = 25
score = 95.5
# Basic interpolation
print(f"姓名:{name},年龄:{age}")
# Arbitrary expressions
print(f"明年{age + 1}岁")
print(f"姓名长度:{len(name)}")
# Method calls (upper() leaves these Chinese characters unchanged; they have no case)
print(f"大写:{name.upper()}")
# Format specifiers
# Number formatting (quoted values below show only the formatted field, not the label)
print(f"分数:{score:.2f}") # 95.50 (2 decimal places)
print(f"分数:{score:>10.2f}") # '     95.50' (right-aligned, width 10)
print(f"分数:{score:<10.2f}") # '95.50     ' (left-aligned)
print(f"分数:{score:^10.2f}") # '  95.50   ' (centered; the extra space goes on the right)
print(f"分数:{score:010.2f}") # '0000095.50' (zero-padded)
# Integer formatting
num = 255
print(f"十进制:{num:d}") # 255
print(f"二进制:{num:b}") # 11111111
print(f"八进制:{num:o}") # 377
print(f"十六进制:{num:x}") # ff
print(f"十六进制:{num:X}") # FF
print(f"带前缀:{num:#x}") # 0xff
print(f"带前缀:{num:#b}") # 0b11111111
# Thousands separators
big_num = 1234567890
print(f"金额:{big_num:,}") # 1,234,567,890
print(f"金额:{big_num:_}") # 1_234_567_890
# Percentages
ratio = 0.756
print(f"比例:{ratio:.1%}") # 75.6%
# Alignment and fill characters
text = "Hi"
print(f"{text:>10}") # '        Hi'
print(f"{text:<10}") # 'Hi        '
print(f"{text:^10}") # '    Hi    '
print(f"{text:*^10}") # '****Hi****'
print(f"{text:-<10}") # 'Hi--------'
# Date formatting (the values shown are examples; actual output depends on the current time)
from datetime import datetime
now = datetime.now()
print(f"日期:{now:%Y-%m-%d}") # e.g. 2024-12-18
print(f"时间:{now:%H:%M:%S}") # e.g. 14:30:00
print(f"完整:{now:%Y-%m-%d %H:%M:%S}") # e.g. 2024-12-18 14:30:00
# Self-documenting "=" specifier for debugging (Python 3.8+)
x = 10
y = 20
print(f"{x=}, {y=}") # x=10, y=20
print(f"{x + y=}") # x + y=30
2.2 format方法
# Positional arguments
print("{},{}岁".format("张三", 25))
print("{0},{1}岁,{0}是学生".format("张三", 25))
# Keyword arguments
print("{name},{age}岁".format(name="张三", age=25))
# Mixing positional and keyword arguments
print("{0},{age}岁".format("张三", age=25))
# Format specifiers
print("{:.2f}".format(3.14159)) # 3.14
print("{:>10}".format("Hi")) # '        Hi'
# Formatting from a dict by unpacking it into keyword arguments
person = {"name": "张三", "age": 25}
print("{name},{age}岁".format(**person))
# Indexing into an argument inside the replacement field
point = (3, 4)
print("x={0[0]}, y={0[1]}".format(point))
2.3 %格式化(旧式)
# printf-style formatting, similar to C
name = "张三"
age = 25
score = 95.5
print("姓名:%s" % name)
print("年龄:%d" % age)
print("分数:%.2f" % score)
print("姓名:%s,年龄:%d" % (name, age))
# Format specifiers
print("%10s" % "Hi") # '        Hi' (right-aligned, width 10)
print("%-10s" % "Hi") # 'Hi        ' (left-aligned, width 10)
print("%010d" % 42) # '0000000042' (zero-padded)
print("%+d" % 42) # '+42' (explicit plus sign)
print("%x" % 255) # 'ff' (hexadecimal)
3. 字符串常用方法
3.1 查找与替换
s = "Hello, World! Hello, Python!"
# Searching
print(s.find("Hello")) # 0 (index of the first occurrence)
print(s.find("Hello", 5)) # 14 (search starts at index 5)
print(s.find("Java")) # -1 (find returns -1 when there is no match)
print(s.rfind("Hello")) # 14 (index of the last occurrence)
print(s.index("Hello")) # 0 (like find, but raises ValueError when there is no match)
print(s.rindex("Hello")) # 14
print(s.count("Hello")) # 2 (number of non-overlapping occurrences)
# Replacing
print(s.replace("Hello", "Hi")) # replaces every occurrence
print(s.replace("Hello", "Hi", 1)) # replaces only the first occurrence
# Prefix and suffix checks
print(s.startswith("Hello")) # True
print(s.endswith("!")) # True
print(s.startswith(("Hello", "Hi"))) # True (a tuple tests several prefixes at once)
3.2 分割与连接
# Splitting
s = "apple,banana,cherry"
print(s.split(",")) # ['apple', 'banana', 'cherry']
print(s.split(",", 1)) # ['apple', 'banana,cherry'] (at most one split)
s = " hello world "
print(s.split()) # ['hello', 'world'] (no argument: split on runs of whitespace)
s = "line1\nline2\nline3"
print(s.splitlines()) # ['line1', 'line2', 'line3']
# Splitting from the right
s = "a.b.c.d"
print(s.rsplit(".", 1)) # ['a.b.c', 'd']
# Splitting while keeping the separators: a capturing group makes re.split return them
import re
s = "one1two2three3four"
print(re.split(r'(\d)', s)) # ['one', '1', 'two', '2', 'three', '3', 'four']
# Joining
words = ["apple", "banana", "cherry"]
print(",".join(words)) # "apple,banana,cherry"
print(" ".join(words)) # "apple banana cherry"
print("\n".join(words)) # one word per line
# Note: join only accepts an iterable of strings
numbers = [1, 2, 3]
# print(",".join(numbers)) # TypeError
print(",".join(map(str, numbers))) # "1,2,3"
3.3 大小写转换
s = "Hello, World!"
print(s.upper()) # "HELLO, WORLD!"
print(s.lower()) # "hello, world!"
print(s.capitalize()) # "Hello, world!" (first character upper, the rest lower)
print(s.title()) # "Hello, World!" (first letter of each word upper)
print(s.swapcase()) # "hELLO, wORLD!" (case swapped)
# Case-insensitive comparison
s1 = "Hello"
s2 = "hello"
print(s1.lower() == s2.lower()) # True
print(s1.casefold() == s2.casefold()) # True (casefold is the more aggressive folding)
3.4 去除空白与填充
s = " Hello, World! "
# Stripping whitespace
print(s.strip()) # "Hello, World!" (both ends)
print(s.lstrip()) # "Hello, World! " (left end only)
print(s.rstrip()) # " Hello, World!" (right end only)
# Stripping specific characters
s = "###Hello###"
print(s.strip("#")) # "Hello"
# Padding to a fixed width
s = "Hi"
print(s.center(10)) # "    Hi    "
print(s.center(10, "*")) # "****Hi****"
print(s.ljust(10)) # "Hi        "
print(s.rjust(10)) # "        Hi"
print(s.zfill(5)) # "000Hi"
# Zero-padding numeric strings
num = "42"
print(num.zfill(5)) # "00042"
print("-42".zfill(5)) # "-0042" (zeros are inserted after the sign)
3.5 判断方法
# Content predicates
print("123".isdigit()) # True (all digits)
print("abc".isalpha()) # True (all letters)
print("abc123".isalnum()) # True (all letters or digits)
print(" ".isspace()) # True (all whitespace)
print("Hello".istitle()) # True (title-cased)
print("HELLO".isupper()) # True (all upper case)
print("hello".islower()) # True (all lower case)
# How the numeric predicates differ
print("123".isdigit()) # True
print("123".isdecimal()) # True
print("123".isnumeric()) # True
print("½".isdigit()) # False
print("½".isnumeric()) # True (isnumeric accepts more numeric characters)
# Identifier checks
print("my_var".isidentifier()) # True (a valid variable name)
print("2var".isidentifier()) # False
print("import".isidentifier()) # True (syntactically an identifier, although it is a keyword)
import keyword
print(keyword.iskeyword("import")) # True
4. 正则表达式入门
4.1 什么是正则表达式
正则表达式(Regular Expression,简称regex)是一种强大的文本匹配工具,用于描述字符串的模式。
4.2 基本语法
| 元字符 | 说明 | 示例 |
|---|---|---|
| . | 匹配任意字符(除换行) | a.c 匹配 “abc”, “a1c” |
| ^ | 匹配字符串开头 | ^Hello |
| $ | 匹配字符串结尾 | World$ |
| * | 匹配0次或多次 | ab* 匹配 “a”, “ab”, “abb” |
| + | 匹配1次或多次 | ab+ 匹配 “ab”, “abb” |
| ? | 匹配0次或1次 | ab? 匹配 “a”, “ab” |
| {n} | 匹配n次 | a{3} 匹配 “aaa” |
| {n,m} | 匹配n到m次 | a{2,4} 匹配 “aa”, “aaa”, “aaaa” |
| [] | 字符集 | [abc] 匹配 “a”, “b”, “c” |
| [^] | 否定字符集 | [^abc] 匹配非a/b/c的字符 |
| \| | 或 | a\|b 匹配 “a” 或 “b” |
| () | 分组 | (ab)+ 匹配 “ab”, “abab” |
| \d | 数字 [0-9](str模式下默认还匹配其他Unicode数字,如全角数字;加re.ASCII才仅限0-9) | \d+ 匹配数字串 |
| \D | 非数字 | \D+ 匹配非数字串 |
| \w | 单词字符 [a-zA-Z0-9_](str模式下默认还匹配中文等Unicode字母数字;加re.ASCII才仅限ASCII) | \w+ 匹配单词 |
| \W | 非单词字符 | |
| \s | 空白字符 | \s+ 匹配空白 |
| \S | 非空白字符 | |
| \b | 单词边界(按\w定义,因此默认情况下中文字符与数字之间没有边界) | \bword\b 匹配完整单词 |
4.3 re模块常用函数
import re
# Sample text used by every example below. The address must contain a real
# "@" for the e-mail group examples to match.
text = "Hello, my email is test@example.com and phone is 13812345678"
# match: only matches at the very start of the string
result = re.match(r"Hello", text)
if result:
    print(result.group())  # "Hello"
# search: first match anywhere in the string
result = re.search(r"\d+", text)
if result:
    print(result.group())  # "13812345678"
# findall: list of all non-overlapping matches
numbers = re.findall(r"\d+", text)
print(numbers)  # ['13812345678']
# finditer: iterator of match objects (gives access to positions)
for match in re.finditer(r"\d+", text):
    print(f"找到:{match.group()},位置:{match.span()}")
# sub: replace every match
new_text = re.sub(r"\d+", "***", text)
print(new_text)  # "Hello, my email is test@example.com and phone is ***"
# split: split on a pattern
parts = re.split(r"\s+", "hello world python")
print(parts)  # ['hello', 'world', 'python']
# Compile a pattern once when it is reused
pattern = re.compile(r"\d+")
result = pattern.findall(text)
# Groups
email_pattern = r"(\w+)@(\w+)\.(\w+)"
match = re.search(email_pattern, text)
if match:
    print(match.group())  # "test@example.com"
    print(match.group(1))  # "test"
    print(match.group(2))  # "example"
    print(match.group(3))  # "com"
    print(match.groups())  # ('test', 'example', 'com')
# Named groups
email_pattern = r"(?P<user>\w+)@(?P<domain>\w+)\.(?P<suffix>\w+)"
match = re.search(email_pattern, text)
if match:
    print(match.group("user"))  # "test"
    print(match.group("domain"))  # "example"
    print(match.groupdict())  # {'user': 'test', 'domain': 'example', 'suffix': 'com'}
# Flags
# re.IGNORECASE (re.I): case-insensitive matching
# re.MULTILINE (re.M): ^ and $ also match at line boundaries
# re.DOTALL (re.S): . also matches a newline
# re.VERBOSE (re.X): whitespace in the pattern is ignored and # starts a comment
pattern = re.compile(r"""
    \d{3}    # area code
    [-\s]?   # optional separator
    \d{4}    # first four digits
    [-\s]?   # optional separator
    \d{4}    # last four digits
""", re.VERBOSE)
4.4 实际应用示例
import re
# 1. 验证邮箱
def is_valid_email(email):
    """Return True if *email* has the simple shape ``local@domain.tld``.

    This is a lightweight shape check, not full RFC validation: the local
    part and domain may contain word characters, dots and hyphens, and the
    domain must end with a dot followed by word characters.

    Args:
        email: The string to check.

    Returns:
        bool: True when the whole string matches the pattern.
    """
    pattern = r'[\w\.-]+@[\w\.-]+\.\w+'
    # fullmatch instead of match + "$": "$" also matches just before a
    # trailing newline, which would let "user@example.com\n" pass.
    return re.fullmatch(pattern, email) is not None


print(is_valid_email("test@example.com"))  # True
print(is_valid_email("invalid-email"))  # False
# 2. 验证手机号(中国大陆)
def is_valid_phone(phone):
    """Return True if *phone* is an 11-digit mainland-China mobile number.

    The number must start with ``1``, have a second digit in 3-9, and be
    followed by nine more ASCII digits.

    Args:
        phone: The string to check (digits only, no separators).

    Returns:
        bool: True when the whole string matches.
    """
    # [0-9] rather than \d: in str patterns \d also accepts non-ASCII digits
    # such as full-width ones. fullmatch rather than match + "$": "$" would
    # tolerate a trailing newline.
    pattern = r'1[3-9][0-9]{9}'
    return re.fullmatch(pattern, phone) is not None


print(is_valid_phone("13812345678"))  # True
print(is_valid_phone("12345678901"))  # False
# 3. 提取URL
def extract_urls(text):
    """Return every http/https URL found in *text*, in order of appearance.

    A URL here is a scheme, a host made of ASCII word characters, dots and
    hyphens, and optional ``/``-separated path segments of the same
    characters. Query strings and fragments are not captured.

    Args:
        text: The text to scan.

    Returns:
        list[str]: The matched URLs (empty when there are none).
    """
    pattern = r'https?://[\w\.-]+(?:/[\w\.-]*)*'
    # re.ASCII keeps \w to [a-zA-Z0-9_]. Without it \w also matches Chinese
    # characters, so a URL written directly next to Chinese text would
    # swallow that text. Trade-off: non-ASCII (IDN) hostnames are not matched.
    return re.findall(pattern, text, flags=re.ASCII)


text = "访问 https://www.example.com 或 http://test.org/page"
print(extract_urls(text))  # ['https://www.example.com', 'http://test.org/page']
# 4. 提取IP地址
def extract_ips(text):
    """Return every dotted-quad (IPv4-shaped) token found in *text*.

    Only the shape is checked: four groups of 1-3 digits separated by dots.
    Octet values are not range-checked.

    Args:
        text: The text to scan, e.g. a log line.

    Returns:
        list[str]: The matched addresses in order of appearance.
    """
    pattern = r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b'
    # re.ASCII is required for text that mixes Chinese and digits. By default
    # \w covers Chinese characters, so \b finds no word boundary between a
    # Chinese character and a digit, and addresses embedded in Chinese text
    # are missed.
    return re.findall(pattern, text, flags=re.ASCII)


log = "来自192.168.1.1的请求,转发到10.0.0.1"
print(extract_ips(log))  # ['192.168.1.1', '10.0.0.1']
# 5. 解析日志时间戳
def parse_log_timestamp(log_line):
    """Extract the bracketed ``YYYY-MM-DD HH:MM:SS`` timestamp from a log line.

    Args:
        log_line: One line of log text.

    Returns:
        The timestamp text without the surrounding brackets, taken from the
        first occurrence in the line, or ``None`` if the line has none.
    """
    found = re.search(r'\[(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\]', log_line)
    if found is None:
        return None
    return found.group(1)


log = "[2024-12-18 14:30:00] INFO: Application started"
print(parse_log_timestamp(log))  # "2024-12-18 14:30:00"
5. 字符串编码
# Python 3 str objects hold Unicode text
s = "你好,世界"
print(type(s))  # <class 'str'>
# Encoding: str -> bytes
b = s.encode('utf-8')
print(b)  # b'\xe4\xbd\xa0\xe5\xa5\xbd...'
print(type(b))  # <class 'bytes'>
# Decoding: bytes -> str
s2 = b.decode('utf-8')
print(s2)  # "你好,世界"
# The same text in different encodings
s = "你好"
print(s.encode('utf-8'))  # b'\xe4\xbd\xa0\xe5\xa5\xbd' (3 bytes per character here)
print(s.encode('gbk'))  # b'\xc4\xe3\xba\xc3' (2 bytes per character here)
print(s.encode('utf-16'))  # b'\xff\xfe`O}Y' (BOM first; byte order shown is little-endian)
# Handling encoding errors
s = "Hello, 世界"
# Strict mode (the default) raises on characters the codec cannot represent
try:
    b = s.encode('ascii')
except UnicodeEncodeError as e:
    print(f"编码错误:{e}")
# Drop characters that cannot be encoded
b = s.encode('ascii', errors='ignore')
print(b)  # b'Hello, '
# Replace characters that cannot be encoded with "?"
b = s.encode('ascii', errors='replace')
print(b)  # b'Hello, ??'
# Always state the encoding when reading a text file
# (illustrative: 'file.txt' must exist for this to run)
with open('file.txt', 'r', encoding='utf-8') as f:
    content = f.read()
6. 实际应用场景
6.1 日志解析
import re
from datetime import datetime
def parse_log_line(line):
    """Parse one log line into its timestamp, level and message.

    Expected format: ``[2024-12-18 14:30:00] [INFO] Message here``.

    Args:
        line: A single log line; the pattern is anchored at its start.

    Returns:
        A dict with keys ``'timestamp'`` (``datetime``), ``'level'`` (str)
        and ``'message'`` (str), or ``None`` when the line does not follow
        the format or its timestamp is not a real date/time.
    """
    pattern = r'\[(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\] \[(\w+)\] (.+)'
    match = re.match(pattern, line)
    if match is None:
        return None
    raw_timestamp, level, message = match.groups()
    try:
        timestamp = datetime.strptime(raw_timestamp, '%Y-%m-%d %H:%M:%S')
    except ValueError:
        # The digits had the right shape but are not a valid date/time
        # (e.g. month 13). Treat it like any other unparseable line instead
        # of crashing the caller's loop.
        return None
    return {'timestamp': timestamp, 'level': level, 'message': message}


# Example
log_lines = [
    "[2024-12-18 14:30:00] [INFO] Application started",
    "[2024-12-18 14:30:01] [ERROR] Connection failed",
    "[2024-12-18 14:30:02] [WARNING] Low memory",
]
for line in log_lines:
    result = parse_log_line(line)
    if result:
        print(f"{result['level']}: {result['message']}")
6.2 文本提取
import re
def extract_data_from_report(report):
    """Extract the date, amount and growth rate from a report text.

    Each field is optional; only the fields found are present in the result.
    Labels may be followed by an ASCII or a full-width colon.

    Args:
        report: The report text to scan.

    Returns:
        dict: May contain ``'date'`` (str as written, ``YYYY-MM-DD`` or
        ``YYYY/MM/DD``), ``'amount'`` (float, thousands separators removed)
        and ``'growth_rate'`` (float, the number before ``%``; may be
        negative).
    """
    data = {}
    # Date
    date_match = re.search(r'日期[:\uff1a]\s*(\d{4}[-/]\d{2}[-/]\d{2})', report)
    if date_match:
        data['date'] = date_match.group(1)
    # Amount: optional currency sign (yen/yuan sign, its full-width form, or
    # a dollar sign). The number must start with a digit, so float() never
    # receives "," or an empty string.
    amount_match = re.search(
        r'金额[:\uff1a]\s*[\u00a5\uffe5$]?(\d[\d,]*(?:\.\d+)?)', report
    )
    if amount_match:
        data['amount'] = float(amount_match.group(1).replace(',', ''))
    # Growth rate: a signed decimal number, so declines such as -3.2% are
    # captured and ".." cannot reach float()
    percent_match = re.search(
        r'增长率[:\uff1a]\s*([-+]?\d+(?:\.\d+)?)%', report
    )
    if percent_match:
        data['growth_rate'] = float(percent_match.group(1))
    return data


report = """
月度销售报告
日期:2024-12-18
总金额:¥1,234,567.89
同比增长率:15.5%
"""
print(extract_data_from_report(report))
# {'date': '2024-12-18', 'amount': 1234567.89, 'growth_rate': 15.5}
6.3 数据清洗
import re
def clean_text(text):
    """Return *text* with symbols removed and whitespace normalised.

    Punctuation and other symbols are dropped; word characters (including
    Chinese) and whitespace are kept. Runs of whitespace are then collapsed
    to a single space and leading/trailing whitespace is removed.

    Args:
        text: The raw text.

    Returns:
        str: The cleaned text.
    """
    # Remove symbols first: deleting a symbol that sat between two spaces
    # leaves two adjacent spaces, so the collapse step must run afterwards.
    text = re.sub(r'[^\w\s\u4e00-\u9fff]', '', text)
    # Collapse every run of whitespace into one space
    text = re.sub(r'\s+', ' ', text)
    return text.strip()
def normalize_phone(phone):
    """Normalise a phone number to ``XXX-XXXX-XXXX``.

    Every non-digit character is discarded. If exactly 11 digits remain they
    are grouped 3-4-4 with hyphens; otherwise the bare digit string is
    returned unchanged.

    Args:
        phone: The phone number in any punctuation style.

    Returns:
        str: The formatted number, or just the digits when there are not 11.
    """
    only_digits = re.sub(r'\D', '', phone)
    if len(only_digits) != 11:
        return only_digits
    head, middle, tail = only_digits[:3], only_digits[3:7], only_digits[7:]
    return f"{head}-{middle}-{tail}"
def extract_chinese(text):
    """Return only the characters of *text* in the range U+4E00-U+9FFF.

    Args:
        text: The mixed-language text.

    Returns:
        str: The matched characters concatenated in their original order
        (empty when there are none).
    """
    runs = (found.group() for found in re.finditer(r'[\u4e00-\u9fff]+', text))
    return ''.join(runs)
# Try the helpers on sample inputs
print(clean_text(" Hello World! @#$ 你好 "))
print(normalize_phone("138-1234-5678")) # 138-1234-5678 (11 digits, regrouped 3-4-4)
print(normalize_phone("(138) 1234 5678")) # 138-1234-5678
print(extract_chinese("Hello你好World世界")) # 你好世界
7. 常见错误与避坑
❌ 错误1:忘记使用原始字符串
import re
# Risky: in a normal (non-raw) literal Python processes backslashes before re sees them
# pattern = "\d+" # still backslash + d, because "\d" is not a recognised escape, but it triggers a DeprecationWarning (SyntaxWarning on Python 3.12+)
# pattern = "\bword\b" # silently wrong: "\b" becomes a backspace character, not a word boundary
# Correct: use a raw string
pattern = r"\d+"
❌ 错误2:字符串不可变
s = "Hello"
# Wrong: trying to modify a string in place
# s[0] = 'h' # TypeError
# Correct: build a new string and rebind the name
s = 'h' + s[1:]
❌ 错误3:编码问题
# Wrong: reading a file without stating the encoding
# with open('file.txt') as f:  # uses the platform default encoding; text may come out garbled
#     content = f.read()
# Correct: state the encoding explicitly
# (illustrative: 'file.txt' must exist for this to run)
with open('file.txt', encoding='utf-8') as f:
    content = f.read()
❌ 错误4:低效的字符串拼接
# Inefficient: += in a loop may copy the growing string on every iteration
result = ""
for i in range(1000):
    result += str(i)
# Preferred: build the string once with join
result = "".join(str(i) for i in range(1000))
# Or collect the pieces in a list first
parts = []
for i in range(1000):
    parts.append(str(i))
result = "".join(parts)
8. 实战练习
练习1:解析Vivado日志
"""
练习:解析Vivado时序报告,提取关键信息
"""
import re
def parse_timing_report(report):
    """Extract WNS, TNS and clock frequency from a timing summary text.

    Expects lines shaped like ``WNS: 0.123 ns``, ``TNS: 0.000 ns`` and
    ``... 100.0 MHz``. The label may be followed by ``:``, a full-width
    colon or ``=``. Fields that are absent or not well-formed numbers are
    left out of the result.

    Args:
        report: The report text.

    Returns:
        dict: May contain ``'wns'`` and ``'tns'`` (float, in ns) and
        ``'frequency'`` (float, in MHz, first occurrence).
    """
    # A well-formed signed decimal. The looser class [-\d.]+ would also
    # capture "-" or "1.2.3" and make float() raise ValueError.
    number = r'[-+]?(?:\d+(?:\.\d*)?|\.\d+)'
    field_patterns = {
        'wns': r'WNS\s*[:\uff1a=]\s*(' + number + r')\s*ns',  # Worst Negative Slack
        'tns': r'TNS\s*[:\uff1a=]\s*(' + number + r')\s*ns',  # Total Negative Slack
        'frequency': r'(\d+\.?\d*)\s*MHz',  # clock frequency
    }
    results = {}
    for key, field_pattern in field_patterns.items():
        found = re.search(field_pattern, report)
        if found:
            results[key] = float(found.group(1))
    return results


# Try it on a sample report
report = """
Timing Summary
WNS: 0.123 ns
TNS: 0.000 ns
Clock: clk_100 @ 100.0 MHz
"""
print(parse_timing_report(report))  # {'wns': 0.123, 'tns': 0.0, 'frequency': 100.0}
练习2:批量重命名
"""
练习:批量重命名文件名中的日期格式
"""
import re
def rename_date_format(filename):
    """Rewrite dates in a file name from ``MM-DD-YYYY`` to ``YYYY-MM-DD``.

    Only plausible dates are rewritten: month 01-12, day 01-31, and the
    whole token must not be part of a longer run of digits. Anything else
    is left untouched.

    Args:
        filename: The file name to convert.

    Returns:
        str: The file name with every matching date reordered.
    """
    # The digit look-arounds stop the pattern from firing inside longer
    # numbers such as "1234-56-78901". The month/day ranges avoid turning a
    # non-date (or a DD-MM-YYYY date) into an impossible YYYY-MM-DD.
    pattern = (
        r'(?<![0-9])(0[1-9]|1[0-2])-(0[1-9]|[12][0-9]|3[01])-([0-9]{4})(?![0-9])'
    )
    # Groups: 1 = month, 2 = day, 3 = year
    return re.sub(pattern, r'\3-\1-\2', filename)


# Try it on sample file names
filenames = [
    "report_12-18-2024.xlsx",
    "data_01-05-2024_backup.csv",
    "log_11-30-2024.txt",
]
for name in filenames:
    print(f"{name} -> {rename_date_format(name)}")
9. 总结
🔑 核心要点
| 知识点 | 要点 |
|---|---|
| f-string | 最推荐的格式化方式,支持表达式和格式说明符 |
| 字符串方法 | find/replace/split/join/strip等 |
| 正则表达式 | 强大的模式匹配工具,re模块 |
| 编码 | Python 3默认Unicode,注意文件编码 |
| 性能 | 大量拼接用join,避免循环中用+ |
✅ 学习检查清单
- 掌握f-string格式化
- 熟练使用字符串常用方法
- 理解正则表达式基本语法
- 能使用re模块进行文本匹配和提取
- 理解字符串编码问题
📖 下一步学习
掌握了字符串处理后,让我们继续学习Python的文件操作(下一篇:10 - Python文件操作)。
常见问题 FAQ
💬 正则表达式字符串前面为什么要加r?
r""是原始字符串,Python不会处理其中的反斜杠转义。正则表达式充满反斜杠:不加r时,"\b"会被Python解释为退格符,而不是正则的单词边界;"\d"这类Python不认识的转义虽然仍会保留反斜杠,但会触发DeprecationWarning(Python 3.12起为SyntaxWarning)。养成习惯:正则表达式一律用r""。
💬 字符串拼接用+还是join?
少量拼接用+或f-string。大量拼接(如循环中)应使用"".join(parts):字符串不可变,+每次都可能创建并复制一个新字符串,最坏情况下总耗时随长度呈平方增长(CPython对+=有优化,但并不保证生效),数据量大时性能差距会非常明显。
📚 系列导航
- 上一篇:08 - Python函数基础
- 当前:09 - Python字符串处理
- 下一篇:10 - Python文件操作