from collections import Counter

from docx import Document
from docx.enum.text import WD_COLOR_INDEX
from docx.shared import Pt
def r(lis):
    """Return the most frequent element of *lis*.

    Ties are resolved in favour of the element that was encountered first
    (the ordering guaranteed by ``Counter.most_common``).

    :param lis: iterable of hashable values (``None`` is a valid value)
    :return: the most common value, or ``None`` when *lis* is empty
    """
    counter = Counter(lis)
    if not counter:
        # Nothing to vote on: report "unknown" instead of raising IndexError.
        return None
    return counter.most_common(1)[0][0]
# Font attribute keys stored on every returned entry (besides "text"/"type").
_FONT_KEYS = (
    "name",
    "size",
    "bold",
    "italic",
    "underline",
    "strike",
    "subscript",
    "superscript",
    "color",
    "highlight",
)


def _run_font_attrs(run, fallback_size=None):
    """Collect the font attributes set on a single run.

    :param run: a python-docx run object
    :param fallback_size: size in points to use when the run has no size of its own
    :return: dict keyed by the names in ``_FONT_KEYS``
    """
    font = run.font
    return {
        "name": font.name,  # font name
        "size": font.size.pt if font.size else fallback_size,  # size in points
        "bold": font.bold,  # True/False/None
        "italic": font.italic,  # True/False/None
        "underline": font.underline,  # None/True/underline style
        "strike": font.strike,  # True/False/None
        "subscript": font.subscript,  # True/False/None
        "superscript": font.superscript,  # True/False/None
        "color": font.color.rgb if font.color.rgb else None,  # RGB color
        "highlight": font.highlight_color,  # highlight color enum value
    }


def parse_word_font(doc_path):
    """Parse the font information of a Word document.

    Body paragraphs produce one entry per paragraph: the text of its
    non-blank runs joined together, with each font attribute set to the most
    common value among those runs.  Table cells produce one entry per
    non-blank run.  Entries whose size is still unknown at the end are given
    the most common size seen in the whole document.

    :param doc_path: path of the Word document (.docx)
    :return: list of flat dicts with the keys ``text``, ``type``
        (``"paragraph"`` or ``"table_cell"``) and the font attributes
        ``name``, ``size``, ``bold``, ``italic``, ``underline``, ``strike``,
        ``subscript``, ``superscript``, ``color``, ``highlight``
    """
    doc = Document(doc_path)
    font_info_list = []
    all_sizes = []

    # 1. Fonts of the body paragraphs
    for para in doc.paragraphs:
        if not para.text.strip():  # skip empty paragraphs
            continue
        # When the size is defined on the paragraph style, the runs inside the
        # paragraph report a size of None, so the style size is the fallback.
        style_size = para.style.font.size
        para_size = style_size.pt if style_size else None

        run_texts = []
        run_attrs = []
        for run in para.runs:
            if not run.text.strip():
                continue
            attrs = _run_font_attrs(run, para_size)
            all_sizes.append(attrs["size"])
            run_texts.append(run.text)
            run_attrs.append(attrs)

        if not run_attrs:
            # The paragraph has text but no non-blank run to take font data
            # from; there is nothing to vote on, so skip it instead of crashing.
            continue

        entry = {"text": "".join(run_texts)}
        for key in _FONT_KEYS:
            entry[key] = r([attrs[key] for attrs in run_attrs])
        entry["type"] = "paragraph"  # marks the entry as body-paragraph text
        font_info_list.append(entry)

    # 2. Fonts of the text inside tables
    for table in doc.tables:
        for row in table.rows:
            for cell in row.cells:
                for para in cell.paragraphs:
                    for run in para.runs:
                        if not run.text.strip():
                            continue
                        attrs = _run_font_attrs(run)
                        all_sizes.append(attrs["size"])
                        font_info_list.append({
                            "text": run.text,
                            **attrs,
                            "type": "table_cell",  # marks the entry as table-cell text
                        })

    # Fill the remaining unknown sizes with the document-wide most common size.
    known_sizes = [size for size in all_sizes if size is not None]
    origin_size = r(known_sizes) if known_sizes else None
    for entry in font_info_list:
        if entry["size"] is None:
            entry["size"] = origin_size
    return font_info_list
# Usage example
if __name__ == "__main__":
    # Raw string: "\叶" is not a valid escape sequence, the backslash is a path separator.
    doc_path = r"new2\叶圣陶集 第25卷 书信 2 第2版_11994042.docx"  # replace with the path of your Word document
    font_info = parse_word_font(doc_path)
    # Print the first parsed entries
    for idx, item in enumerate(font_info[:10]):
        print(f"=== 文本块 {idx+1} ===")
        print(f"文本内容: {item['text']}")
        print(f"文本类型: {item['type']}")
        print("字体属性:")
        # Entries are flat dicts: every key other than text/type is a font attribute.
        for k, v in item.items():
            if k in ("text", "type"):
                continue
            # Format the color / highlight values for display
            if k == "color" and v:
                v = f"RGB({v[0]}, {v[1]}, {v[2]})"
            elif k == "highlight" and v:
                v = f"{v.name} ({v.value})"
            print(f" {k}: {v}")
        print("-" * 50)