word-analysis

star 4.0k

Word (.docx/.doc) 文档全量解析。覆盖:正文/段落文本提取、表格数据提取、高亮/颜色格式读取、多文件汇总对比、嵌入图片转 caption。

OpenSenseNova By OpenSenseNova schedule Updated 6/4/2026

---
name: word-analysis
description: "Word (.docx/.doc) 文档全量解析。覆盖:正文/段落文本提取、表格数据提取、高亮/颜色格式读取、多文件汇总对比、嵌入图片转 caption。"
---

Word Analysis — .docx / .doc

Environment

from docx import Document
import os

# python-docx is available; for .doc (old format) convert via libreoffice first
def load_doc(path):
    """Open a Word file as a python-docx ``Document``.

    A ``.docx`` file is opened directly. A legacy ``.doc`` file is first
    converted with headless LibreOffice; the converted ``.docx`` is written
    into the same directory as the source file and then opened.

    Args:
        path: Path (``str`` or ``os.PathLike``) to a ``.docx`` or ``.doc`` file.

    Returns:
        The loaded ``Document``.

    Raises:
        subprocess.CalledProcessError: If the LibreOffice conversion exits
            with a non-zero status.
    """
    path = os.fspath(path)
    if path.lower().endswith('.doc'):
        import subprocess
        # Resolve to an absolute path first: for a bare filename such as
        # "report.doc", os.path.dirname() returns '' and LibreOffice would be
        # handed an empty --outdir argument.
        src = os.path.abspath(path)
        out_dir = os.path.dirname(src)
        subprocess.run(
            ['libreoffice', '--headless', '--convert-to', 'docx', '--outdir', out_dir, src],
            check=True, capture_output=True
        )
        # LibreOffice keeps the base name and only swaps the extension.
        path = os.path.splitext(src)[0] + '.docx'
    return Document(path)

Core Method 1: Full Text Extraction

def extract_full_text(doc_path):
    """Extract paragraph and table text in the order it appears in the body.

    Only direct children of the document body tagged ``p`` (paragraph) or
    ``tbl`` (table) are read; any other body child is skipped. Each non-empty
    paragraph becomes one output line. Each non-empty table row becomes one
    output line with its stripped cell texts joined by tab characters.

    Args:
        doc_path: Path to the Word file; opened via ``load_doc``.

    Returns:
        All collected lines joined with newline characters.
    """
    # Imported once up front instead of on every loop iteration.
    from docx.table import Table
    from docx.text.paragraph import Paragraph

    doc = load_doc(doc_path)
    lines = []

    for block in doc.element.body:
        # Tags are namespace-qualified ("{ns}p"); keep only the local name.
        tag = block.tag.split('}')[-1]
        if tag == 'p':
            text = Paragraph(block, doc).text.strip()
            if text:
                lines.append(text)
        elif tag == 'tbl':
            for row in Table(block, doc).rows:
                row_text = '\t'.join(cell.text.strip() for cell in row.cells)
                # A row whose cells are all empty is just tabs; drop it.
                if row_text.strip():
                    lines.append(row_text)

    return '\n'.join(lines)

# Usage: extract the whole document, then print only a short preview of it
text = extract_full_text("/mnt/data/doc.docx")
print(text[:2000])  # preview first 2000 chars

Core Method 2: Table Extraction (Structured)

import pandas as pd

def extract_all_tables(doc_path):
    """Extract every table of a Word document as a pandas DataFrame.

    The first row of each table is always used as the column header and the
    remaining rows as data. Tables without any rows are skipped. A short
    preview (shape and first three rows) is printed for each table.

    Args:
        doc_path: Path to the Word file; opened via ``load_doc``.

    Returns:
        A list of ``(table_index, DataFrame)`` tuples, where ``table_index``
        is the position of the table in ``doc.tables``.
    """
    doc = load_doc(doc_path)
    tables = []

    for i, tbl in enumerate(doc.tables):
        rows = [[cell.text.strip() for cell in row.cells] for row in tbl.rows]
        if not rows:
            continue
        # Pad short rows so every row has the same width; otherwise
        # pd.DataFrame raises when a data row and the header differ in length.
        width = max(len(r) for r in rows)
        rows = [r + [''] * (width - len(r)) for r in rows]
        header, *body = rows
        df = pd.DataFrame(body, columns=header)
        tables.append((i, df))
        print(f"Table {i}: {df.shape[0]} rows × {df.shape[1]} cols")
        print(df.head(3))

    return tables

# Usage: returns a list of (table_index, DataFrame) tuples and prints a preview of each table
tables = extract_all_tables("/mnt/data/doc.docx")

Core Method 3: Format-Aware Extraction (Color / Highlight)

Some questions require reading cell background color or text highlight color (e.g., "标黄的行", "红色文字"). Use XML-level access:

from docx import Document
from docx.oxml.ns import qn
from lxml import etree

def get_paragraph_highlight(para):
    """Return the highlight value of the first run in *para* that has one.

    Runs are scanned in order; runs without run properties or without a
    ``w:highlight`` element are skipped. Returns the element's ``w:val``
    attribute (e.g. 'yellow', 'cyan', 'red'), or None when no run is
    highlighted.
    """
    for run in para.runs:
        run_props = run._r.find(qn('w:rPr'))
        if run_props is None:
            continue
        mark = run_props.find(qn('w:highlight'))
        if mark is None:
            continue
        return mark.get(qn('w:val'))
    return None

def get_table_cell_shading(cell):
    """Return the ``w:fill`` attribute of the cell's shading element.

    Looks up ``w:tcPr`` on the cell and ``w:shd`` inside it. Returns the fill
    value (a hex colour such as 'FFFF00'), or None when either element is
    missing.
    """
    cell_props = cell._tc.find(qn('w:tcPr'))
    if cell_props is None:
        return None
    shading = cell_props.find(qn('w:shd'))
    if shading is None:
        return None
    return shading.get(qn('w:fill'))

# Example: find all highlighted paragraphs
def find_highlighted_rows(doc_path, color='yellow'):
    """Collect the paragraphs of ``doc.paragraphs`` highlighted in *color*.

    A paragraph matches when ``get_paragraph_highlight`` returns *color*;
    for the default 'yellow', the value 'FFFF00' is accepted as well.

    Returns:
        A list of ``(paragraph_index, paragraph_text)`` tuples.
    """
    # 'FFFF00' is treated as an alias only when yellow was requested.
    accepted = (color, 'FFFF00') if color == 'yellow' else (color,)
    matches = []
    for idx, para in enumerate(load_doc(doc_path).paragraphs):
        if get_paragraph_highlight(para) in accepted:
            matches.append((idx, para.text))
    return matches

# For table cells with yellow background:
def find_highlighted_table_cells(doc_path, fill_colors=('FFFF00', 'FFD700')):
    """Find table cells whose background fill is one of *fill_colors*.

    The comparison is case-insensitive on both sides and ignores a leading
    '#', so 'ffff00', 'FFFF00' and '#FFFF00' all match the same cells.

    Args:
        doc_path: Path to the Word file; opened via ``load_doc``.
        fill_colors: A hex colour string or an iterable of hex colour strings.

    Returns:
        A list of dicts with keys 'table', 'row', 'col' (zero-based indices),
        'color' (the fill value as stored in the document) and 'text'
        (the stripped cell text).
    """
    # A single string would otherwise be iterated character by character.
    if isinstance(fill_colors, str):
        fill_colors = (fill_colors,)
    wanted = {c.lstrip('#').upper() for c in fill_colors}

    doc = load_doc(doc_path)
    results = []
    for t_idx, tbl in enumerate(doc.tables):
        for r_idx, row in enumerate(tbl.rows):
            for c_idx, cell in enumerate(row.cells):
                color = get_table_cell_shading(cell)
                if color and color.upper() in wanted:
                    results.append({
                        'table': t_idx, 'row': r_idx, 'col': c_idx,
                        'color': color, 'text': cell.text.strip()
                    })
    return results

Core Method 4: Multi-File Aggregation

When the user asks about "these files" or the input is a directory:

def process_all_docs(file_list, extractor_fn):
    """Run *extractor_fn* on each path in *file_list* and gather the results.

    A header line is printed before each file. If the extractor raises, the
    error is printed and that file is left out of the returned list, so one
    bad file does not stop the batch.

    Returns:
        A list of ``{'file': <base name>, 'data': <extractor result>}`` dicts,
        one per successfully processed file, in input order.
    """
    collected = []
    for path in file_list:
        name = os.path.basename(path)
        print(f"\n=== Processing: {name} ===")
        try:
            data = extractor_fn(path)
        except Exception as e:
            print(f"  ERROR: {e}")
        else:
            collected.append({'file': name, 'data': data})
    return collected

# Example: extract text from every .docx / .doc path in all_files
# NOTE(review): all_files is not defined in this document — presumably a list of
# input file paths supplied by the caller; confirm before running.
doc_files = [f for f in all_files if f.lower().endswith(('.docx', '.doc'))]
results = process_all_docs(doc_files, extract_full_text)

Core Method 5: Embedded Images → Caption

When a Word doc contains embedded images (charts, screenshots):

import zipfile, io, subprocess, json

# Script run by extract_and_caption_images to caption one image file.
# NOTE(review): "/path/to/skills" looks like a placeholder — confirm the real install location.
CAPTION = "/path/to/skills/sn-da-image-caption/scripts/caption.py"

def extract_and_caption_images(doc_path, prompt=None):
    """Extract the images embedded in a .docx and caption each one.

    A .docx is a ZIP archive whose images live under ``word/media/``. Each
    image with a supported extension is copied to a private temporary
    directory and passed to the caption script (``CAPTION``) with ``--json``.
    The temporary directory is removed before the function returns.

    A failure for one image (non-zero exit, timeout, unparsable output) is
    printed and that image is skipped; the remaining images are still handled.

    Args:
        doc_path: Path to a .docx file (must be a ZIP archive).
        prompt: Optional prompt forwarded to the caption script via ``--prompt``.

    Returns:
        A list of ``{'image': <archive member name>, 'caption': <description>}``
        dicts, one per successfully captioned image.
    """
    import tempfile

    image_exts = ('.png', '.jpg', '.jpeg', '.gif', '.bmp', '.wmf', '.emf')
    results = []
    # A per-call temp directory avoids collisions between documents that both
    # contain e.g. "image1.png", and guarantees cleanup.
    with zipfile.ZipFile(doc_path, 'r') as z, tempfile.TemporaryDirectory() as tmp_dir:
        media_files = [n for n in z.namelist() if n.startswith('word/media/')]
        for media in media_files:
            ext = os.path.splitext(media)[-1].lower()
            if ext not in image_exts:
                continue
            # Save to temp
            tmp_path = os.path.join(tmp_dir, os.path.basename(media))
            with z.open(media) as src, open(tmp_path, 'wb') as dst:
                dst.write(src.read())
            # Caption
            cmd = ["python3", CAPTION, tmp_path, "--json"]
            if prompt:
                cmd += ["--prompt", prompt]
            try:
                r = subprocess.run(cmd, capture_output=True, text=True, timeout=60)
            except subprocess.TimeoutExpired:
                # One slow image must not abort the rest of the document.
                print(f"  {media}: caption failed — timed out after 60s")
                continue
            if r.returncode != 0:
                print(f"  {media}: caption failed — {r.stderr[:80]}")
                continue
            try:
                desc = json.loads(r.stdout).get("description", "")
            except (json.JSONDecodeError, AttributeError):
                # Output was not JSON, or not a JSON object.
                print(f"  {media}: caption failed — unparsable output: {r.stdout[:80]}")
                continue
            results.append({'image': media, 'caption': desc})
            print(f"  {media}: {desc[:100]}...")
    return results

Common Patterns

Font/size check (字号检查)

from docx.shared import Pt

def check_font_sizes(doc_path):
    """Report the font size of every run in ``doc.paragraphs``.

    The size is resolved in this order: the run's own formatting, the run's
    character style (following its ``base_style`` chain), then the
    paragraph's style (following its ``base_style`` chain). If none of them
    sets a size, ``size_pt`` is None; sizes that come only from document-level
    defaults are not resolved here.

    Args:
        doc_path: Path to the Word file; opened via ``load_doc``.

    Returns:
        A list with one dict per run: 'para' (paragraph index), 'text'
        (first 30 characters of the run) and 'size_pt' (size in points or None).
    """
    def _inherited_size(style):
        # Walk the style and its ancestors until one of them defines a size.
        while style is not None:
            size = style.font.size
            if size:
                return size
            style = style.base_style
        return None

    doc = load_doc(doc_path)
    issues = []
    for i, para in enumerate(doc.paragraphs):
        for run in para.runs:
            size = (
                run.font.size
                or _inherited_size(run.style)
                # Most text inherits its size from the paragraph style, not
                # from a character style, so check that chain as well.
                or _inherited_size(para.style)
            )
            size_pt = size.pt if size else None
            issues.append({'para': i, 'text': run.text[:30], 'size_pt': size_pt})
    return issues

Spell/grammar check (错别字)

  • Use full-text extraction, then search with string matching or pass to LLM for proofreading
  • Do NOT try to install hunspell or other spell-check tools

Keyword search (全文定位)

def find_keyword(doc_path, keyword):
    """Print where *keyword* first occurs in the document's full text.

    On an exact match, prints the position and a context window (100
    characters before the match, 200 from its start). Otherwise prints a
    "not found" notice followed by two fallbacks: the first case-insensitive
    match, and every whitespace-separated part of *keyword* that occurs in
    the text on its own.

    Args:
        doc_path: Path to the Word file; read via ``extract_full_text``.
        keyword: Text to search for.

    Returns:
        None; results are printed.
    """
    import re

    text = extract_full_text(doc_path)
    idx = text.find(keyword)
    if idx >= 0:
        context = text[max(0, idx-100):idx+200]
        print(f"Found '{keyword}' at pos {idx}:\n{context}")
        return

    print(f"'{keyword}' not found. Try broader search.")
    # Case-insensitive fallback; re.search keeps positions aligned with the
    # original text, which lower()-based comparison does not guarantee.
    ci_match = re.search(re.escape(keyword), text, re.IGNORECASE)
    if ci_match:
        print(f"  Case-insensitive match '{ci_match.group(0)}' at pos {ci_match.start()}")
    # Partial fallback: report each individual word that does occur.
    for kw in keyword.split():
        if kw in text:
            print(f"  Partial match for '{kw}'")

Pitfalls

| Pitfall | Fix |
| --- | --- |
| Only read doc.paragraphs, miss tables | Use the body-order iterator in Method 1 |
| Single file when input is multi-file | Check os.path.isdir(), iterate all |
| Highlighted cells not detected | Use XML-level w:shd / w:highlight (Method 3) |
| .doc format fails to open | Convert to .docx via libreoffice (load_doc in Environment) |
| Embedded charts look empty | Extract images from ZIP, caption each (Method 5) |
| Font size is None | Check both run-level and style-level (check_font_sizes in Common Patterns) |
Install via CLI
npx skills add https://github.com/OpenSenseNova/SenseNova-Skills --skill word-analysis
Repository Details
star Stars 3,984
call_split Forks 278
navigation Branch main
article Path SKILL.md
More from Creator
OpenSenseNova
OpenSenseNova Explore all skills →