name: word-analysis
description: "Word (.docx/.doc) 文档全量解析。覆盖:正文/段落文本提取、表格数据提取、高亮/颜色格式读取、多文件汇总对比、嵌入图片转 caption。"
Word Analysis — .docx / .doc
Environment
from docx import Document
import os
# python-docx is available; for .doc (old format) convert via libreoffice first
def load_doc(path):
    """Open a Word document, converting a legacy .doc file to .docx first.

    Args:
        path: Path to a ``.docx`` or ``.doc`` file.

    Returns:
        The python-docx ``Document`` for the (possibly converted) file.

    Raises:
        subprocess.CalledProcessError: If the libreoffice conversion exits
            with a non-zero status.
    """
    if path.lower().endswith('.doc'):
        import subprocess
        # dirname() is '' for a bare filename; pass the current directory
        # instead so libreoffice never receives an empty --outdir argument.
        out_dir = os.path.dirname(path) or '.'
        subprocess.run(
            ['libreoffice', '--headless', '--convert-to', 'docx', '--outdir', out_dir, path],
            check=True, capture_output=True
        )
        # The converted file is expected next to the source, same stem.
        path = path.rsplit('.', 1)[0] + '.docx'
    return Document(path)
Core Method 1: Full Text Extraction
def extract_full_text(doc_path):
    """Extract all text: paragraphs + table cells, in document order.

    Walks the direct children of the document body so that paragraphs and
    tables are emitted in the order they appear. Body children that are
    neither paragraphs nor tables are skipped.

    Args:
        doc_path: Path to a ``.docx`` or ``.doc`` file (opened via ``load_doc``).

    Returns:
        A single string with one line per non-empty paragraph and one line
        per non-empty table row; cells within a row are tab-separated.
    """
    # Imported once up front rather than on every loop iteration.
    from docx.table import Table
    from docx.text.paragraph import Paragraph

    doc = load_doc(doc_path)
    lines = []
    for block in doc.element.body:
        # Tags are namespace-qualified ('{ns}p'); keep only the local name.
        tag = block.tag.split('}')[-1]
        if tag == 'p':
            text = Paragraph(block, doc).text.strip()
            if text:
                lines.append(text)
        elif tag == 'tbl':
            for row in Table(block, doc).rows:
                row_text = '\t'.join(cell.text.strip() for cell in row.cells)
                # A row of only empty cells is just tabs; drop it.
                if row_text.strip():
                    lines.append(row_text)
    return '\n'.join(lines)
# Usage: extract the whole document as text, then print only a preview
text = extract_full_text("/mnt/data/doc.docx")
print(text[:2000]) # preview first 2000 chars
Core Method 2: Table Extraction (Structured)
import pandas as pd
def extract_all_tables(doc_path):
    """Extract every table of a Word document as a DataFrame.

    The first row of each table is always used as the header; remaining
    rows become the data. A short preview of each table is printed.

    Args:
        doc_path: Path to a ``.docx`` or ``.doc`` file (opened via ``load_doc``).

    Returns:
        A list of ``(table_index, DataFrame)`` tuples, one per table that
        has at least one row.
    """
    doc = load_doc(doc_path)
    tables = []
    for i, tbl in enumerate(doc.tables):
        rows = [[cell.text.strip() for cell in row.cells] for row in tbl.rows]
        if not rows:
            continue
        # Rows can differ in width, which makes the DataFrame constructor
        # raise. Normalise every row to the widest one; for rectangular
        # tables this is a no-op.
        width = max(len(r) for r in rows)
        header = rows[0] + [f"col_{n}" for n in range(len(rows[0]), width)]
        body = [r + [''] * (width - len(r)) for r in rows[1:]]
        df = pd.DataFrame(body, columns=header)
        tables.append((i, df))
        print(f"Table {i}: {df.shape[0]} rows × {df.shape[1]} cols")
        print(df.head(3))
    return tables
# Usage: the result is a list of (table_index, DataFrame) tuples
tables = extract_all_tables("/mnt/data/doc.docx")
Core Method 3: Format-Aware Extraction (Color / Highlight)
Some questions require reading cell background color or text highlight color (e.g., "标黄的行", "红色文字"). Use XML-level access:
from docx import Document
from docx.oxml.ns import qn
from lxml import etree
def get_paragraph_highlight(para):
    """Return the value of the first ``w:highlight`` found in the paragraph.

    Runs are scanned in order; a run without run properties or without a
    highlight element is skipped. Returns None when no run carries one.
    """
    for run in para.runs:
        run_props = run._r.find(qn('w:rPr'))
        if run_props is None:
            continue
        highlight = run_props.find(qn('w:highlight'))
        if highlight is None:
            continue
        # e.g. 'yellow', 'cyan', 'red'
        return highlight.get(qn('w:val'))
    return None
def get_table_cell_shading(cell):
    """Return the ``w:fill`` attribute of the cell's ``w:shd`` element.

    Returns None when the cell has no cell properties or no shading element.
    """
    cell_props = cell._tc.find(qn('w:tcPr'))
    shading = cell_props.find(qn('w:shd')) if cell_props is not None else None
    if shading is None:
        return None
    # hex color, e.g. 'FFFF00'
    return shading.get(qn('w:fill'))
# Example: find all highlighted paragraphs
def find_highlighted_rows(doc_path, color='yellow'):
    """Collect paragraphs whose highlight matches ``color``.

    Only ``doc.paragraphs`` is scanned. When ``color`` is 'yellow', the
    value 'FFFF00' is accepted as well.

    Returns:
        A list of ``(paragraph_index, paragraph_text)`` tuples.
    """
    doc = load_doc(doc_path)
    accepted = (color, 'yellow', 'FFFF00') if color == 'yellow' else (color,)
    return [
        (idx, para.text)
        for idx, para in enumerate(doc.paragraphs)
        if get_paragraph_highlight(para) in accepted
    ]
# For table cells with yellow background:
def find_highlighted_table_cells(doc_path, fill_colors=('FFFF00', 'FFD700')):
    """Find table cells whose background fill is one of ``fill_colors``.

    Matching is case-insensitive on both sides, so ``'ffff00'`` and
    ``'FFFF00'`` are treated as the same color.

    Args:
        doc_path: Path to a ``.docx`` or ``.doc`` file (opened via ``load_doc``).
        fill_colors: Hex fill values to look for; a single string is
            accepted and treated as a one-element collection.

    Returns:
        A list of dicts with keys ``table``, ``row``, ``col`` (zero-based
        indices), ``color`` (the fill as stored in the document) and ``text``.
    """
    if isinstance(fill_colors, str):
        # Avoid substring matching against a bare string argument.
        fill_colors = (fill_colors,)
    # The cell color is upper-cased below, so normalise the targets too.
    wanted = {c.upper() for c in fill_colors}

    doc = load_doc(doc_path)
    results = []
    for t_idx, tbl in enumerate(doc.tables):
        for r_idx, row in enumerate(tbl.rows):
            for c_idx, cell in enumerate(row.cells):
                color = get_table_cell_shading(cell)
                if color and color.upper() in wanted:
                    results.append({
                        'table': t_idx, 'row': r_idx, 'col': c_idx,
                        'color': color, 'text': cell.text.strip()
                    })
    return results
Core Method 4: Multi-File Aggregation
When the user asks about "these files" or the input is a directory:
def process_all_docs(file_list, extractor_fn):
    """Run ``extractor_fn`` on each path and gather the successful results.

    A failure on one file is printed and that file is skipped, so the
    remaining files are still processed.

    Returns:
        A list of ``{'file': <basename>, 'data': <extractor result>}`` dicts,
        in input order, for the files that were extracted without error.
    """
    collected = []
    for doc_path in file_list:
        file_name = os.path.basename(doc_path)
        print(f"\n=== Processing: {file_name} ===")
        try:
            extracted = extractor_fn(doc_path)
        except Exception as err:
            print(f" ERROR: {err}")
        else:
            collected.append({'file': file_name, 'data': extracted})
    return collected
# Example: extract text from every .docx / .doc path in a list
# NOTE(review): `all_files` is not defined in this snippet — presumably the
# caller's list of input file paths; confirm before running.
doc_files = [f for f in all_files if f.lower().endswith(('.docx', '.doc'))]
results = process_all_docs(doc_files, extract_full_text)
Core Method 5: Embedded Images → Caption
When a Word doc contains embedded images (charts, screenshots):
import zipfile, io, subprocess, json
CAPTION = "/path/to/skills/sn-da-image-caption/scripts/caption.py"


def extract_and_caption_images(doc_path, prompt=None):
    """Extract the images embedded in a .docx and caption each one.

    The file is opened as a ZIP archive, so it must be a ``.docx``. Each
    image under ``word/media/`` is copied to a private temporary directory
    and passed to the ``CAPTION`` script. A failure on one image is printed
    and that image is skipped.

    Args:
        doc_path: Path to a ``.docx`` file.
        prompt: Optional prompt forwarded to the caption script via
            ``--prompt``.

    Returns:
        A list of ``{'image': <archive member name>, 'caption': <text>}``
        dicts for the images that were captioned successfully.
    """
    import shutil
    import tempfile

    image_exts = ('.png', '.jpg', '.jpeg', '.gif', '.bmp', '.wmf', '.emf')
    results = []
    # A per-call temp directory avoids name collisions between documents
    # (image1.png is common) and is removed when the block exits.
    with tempfile.TemporaryDirectory(prefix="docx_media_") as tmp_dir, \
            zipfile.ZipFile(doc_path, 'r') as z:
        media_files = [n for n in z.namelist() if n.startswith('word/media/')]
        for media in media_files:
            ext = os.path.splitext(media)[-1].lower()
            if ext not in image_exts:
                continue
            tmp_path = os.path.join(tmp_dir, os.path.basename(media))
            with z.open(media) as src, open(tmp_path, 'wb') as dst:
                shutil.copyfileobj(src, dst)

            cmd = ["python3", CAPTION, tmp_path, "--json"]
            if prompt:
                cmd += ["--prompt", prompt]
            try:
                r = subprocess.run(cmd, capture_output=True, text=True, timeout=60)
            except (subprocess.TimeoutExpired, OSError) as exc:
                # One slow or unlaunchable caption must not abort the rest.
                print(f" {media}: caption failed — {str(exc)[:80]}")
                continue
            if r.returncode != 0:
                print(f" {media}: caption failed — {r.stderr[:80]}")
                continue
            try:
                desc = json.loads(r.stdout).get("description", "")
            except (ValueError, AttributeError) as exc:
                # Output was not JSON, or not a JSON object.
                print(f" {media}: caption failed — {str(exc)[:80]}")
                continue
            results.append({'image': media, 'caption': desc})
            print(f" {media}: {desc[:100]}...")
    return results
Common Patterns
Font/size check (字号检查)
from docx.shared import Pt
def _style_font_size(style):
    """Return the first explicit font size on ``style`` or its base styles."""
    while style is not None:
        size = style.font.size
        if size:
            return size
        # A style stores only its own overrides; inherited values live on
        # the style it is based on.
        style = style.base_style
    return None


def check_font_sizes(doc_path):
    """Report the font size of every run in the document's paragraphs.

    The size is resolved in order from: the run's own formatting, the run's
    character style (and its base styles), then the paragraph's style (and
    its base styles).

    Args:
        doc_path: Path to a ``.docx`` or ``.doc`` file (opened via ``load_doc``).

    Returns:
        A list with one dict per run: ``para`` (paragraph index), ``text``
        (first 30 characters of the run) and ``size_pt`` (size in points, or
        None when no level in the chain above sets one explicitly).
    """
    doc = load_doc(doc_path)
    issues = []
    for i, para in enumerate(doc.paragraphs):
        for run in para.runs:
            size = (
                run.font.size
                or _style_font_size(run.style)
                or _style_font_size(para.style)
            )
            size_pt = size.pt if size else None
            issues.append({'para': i, 'text': run.text[:30], 'size_pt': size_pt})
    return issues
Spell/grammar check (错别字)
- Use full-text extraction, then search with string matching or pass to LLM for proofreading
- Do NOT try to install hunspell or other spell-check tools
Keyword search (全文定位)
def find_keyword(doc_path, keyword):
    """Locate ``keyword`` in the document's full text and print its context.

    An exact match prints its position with 100 characters before and 200
    after. Otherwise two fallbacks are tried and printed: a case-insensitive
    match of the whole keyword, then an exact match of each
    whitespace-separated part of it.

    Args:
        doc_path: Path to a ``.docx`` or ``.doc`` file.
        keyword: Text to search for.

    Returns:
        None; results are printed.
    """
    import re

    text = extract_full_text(doc_path)
    idx = text.find(keyword)
    if idx >= 0:
        context = text[max(0, idx-100):idx+200]
        print(f"Found '{keyword}' at pos {idx}:\n{context}")
        return

    print(f"'{keyword}' not found. Try broader search.")
    # Case-insensitive fallback. A regex match reports offsets in the
    # original text, which lowercasing both strings would not guarantee.
    loose = re.search(re.escape(keyword), text, re.IGNORECASE)
    if loose:
        start = loose.start()
        context = text[max(0, start-100):start+200]
        print(f" Case-insensitive match '{loose.group(0)}' at pos {start}:\n{context}")
    # Partial fallback: report each part of the keyword that does occur.
    for kw in keyword.split():
        if kw in text:
            print(f" Partial match for '{kw}'")
Pitfalls
| Pitfall | Fix |
|---|---|
| Only read doc.paragraphs, miss tables | Use the body-order iterator in Method 1 |
| Single file when input is multi-file | Check os.path.isdir(), iterate all |
| Highlighted cells not detected | Use XML-level w:shd / w:highlight (Method 3) |
| .doc format fails to open | Convert to .docx via libreoffice (see load_doc under Environment) |
| Embedded charts look empty | Extract images from ZIP, caption each (Method 5) |
| Font size is None | Check both run-level and style-level (see "Font/size check" under Common Patterns) |