document-processing

star 0

Document processing with LLM/AI for invoice recognition, OCR, data extraction, and integration with ERP/SAP/Google Sheets/1C/Bitrix systems.

skomax By skomax schedule Updated 3/5/2026

name: document-processing description: Document processing with LLM/AI for invoice recognition, OCR, data extraction, and integration with ERP/SAP/Google Sheets/1C/Bitrix systems.

Document Processing Skill

When to Activate

  • Processing invoices, receipts, bills of lading (TTN)
  • OCR and document recognition
  • LLM-based data extraction from documents
  • Integration with Google Sheets, 1C, Bitrix, SAP/ERP
  • Multi-user document processing pipelines

Architecture

User Upload -> OCR/Vision -> LLM Extraction -> Validation -> Export
                                                               |
                                              +-------+--------+--------+
                                              |       |        |        |
                                           Google   1C API   Bitrix   SAP
                                           Sheets            API      API

Document Processing Pipeline

from dataclasses import dataclass
from enum import Enum

class DocumentType(Enum):
    INVOICE = "invoice"
    RECEIPT = "receipt"
    BILL_OF_LADING = "bill_of_lading"  # TTN
    PURCHASE_ORDER = "purchase_order"
    CUSTOM = "custom"

@dataclass
class ExtractedData:
    document_type: DocumentType
    raw_text: str
    structured_data: dict       # Extracted fields
    confidence: float           # 0.0 - 1.0
    language: str               # Detected language
    needs_review: bool          # Flag for manual review

class DocumentProcessor:
    def __init__(self, llm_client, ocr_engine):
        self.llm = llm_client
        self.ocr = ocr_engine

    async def process(self, file_path: str, user_settings: UserSettings) -> ExtractedData:
        # Step 1: OCR / Vision extraction
        raw_text = await self.extract_text(file_path)

        # Step 2: LLM-based structured extraction
        structured = await self.extract_fields(raw_text, user_settings)

        # Step 3: Translation if needed
        if user_settings.translate_to:
            structured = await self.translate(structured, user_settings.translate_to)

        # Step 4: Validation
        validated = self.validate(structured)

        return validated

LLM Extraction with Anthropic SDK

import json
import anthropic

class LLMExtractor:
    def __init__(self):
        self.client = anthropic.AsyncAnthropic()

    async def extract_invoice(self, text: str) -> dict:
        response = await self.client.messages.create(
            model="claude-sonnet-4-20250514",
            max_tokens=2000,
            messages=[{
                "role": "user",
                "content": f"""Extract structured data from this invoice.
Return JSON with these fields:
- invoice_number: string
- date: YYYY-MM-DD
- supplier_name: string
- supplier_tax_id: string
- buyer_name: string
- buyer_tax_id: string
- items: array of {{description, quantity, unit_price, total, vat_rate}}
- subtotal: number
- vat_amount: number
- total_amount: number
- currency: string

Invoice text:
{text}"""
            }],
        )
        try:
            return json.loads(response.content[0].text)
        except json.JSONDecodeError:
            raise ValueError(f"LLM returned invalid JSON: {response.content[0].text[:200]}")

    async def extract_with_vision(self, image_path: str) -> dict:
        """Process document image directly with Claude Vision."""
        import base64
        with open(image_path, "rb") as f:
            image_data = base64.standard_b64encode(f.read()).decode("utf-8")

        response = await self.client.messages.create(
            model="claude-sonnet-4-20250514",
            max_tokens=2000,
            messages=[{
                "role": "user",
                "content": [
                    {"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": image_data}},
                    {"type": "text", "text": "Extract all data from this document. Return structured JSON."}
                ]
            }],
        )
        try:
            return json.loads(response.content[0].text)
        except json.JSONDecodeError:
            raise ValueError(f"LLM returned invalid JSON: {response.content[0].text[:200]}")

Export Integrations

Google Sheets

from google.oauth2.service_account import Credentials
from googleapiclient.discovery import build

class GoogleSheetsExporter:
    def __init__(self, credentials_path: str):
        creds = Credentials.from_service_account_file(credentials_path)
        self.service = build("sheets", "v4", credentials=creds)

    async def export(self, data: ExtractedData, template_id: str, user_config: dict):
        """Export to user-specific Google Sheet template."""
        import asyncio
        sheet_id = user_config["sheet_id"]
        sheet_range = user_config.get("range", "Sheet1!A1")
        field_mapping = user_config["field_mapping"]  # Maps extracted fields to columns

        values = self.map_to_template(data, field_mapping)

        # googleapiclient is synchronous - run in thread to avoid blocking event loop
        await asyncio.to_thread(
            self.service.spreadsheets().values().append(
                spreadsheetId=sheet_id,
                range=sheet_range,
                valueInputOption="USER_ENTERED",
                body={"values": [values]},
            ).execute
        )

1C Integration

class OneCExporter:
    def __init__(self, base_url: str, username: str, password: str):
        self.base_url = base_url
        self.auth = (username, password)

    async def export_invoice(self, data: ExtractedData):
        """Export invoice to 1C via OData/REST."""
        payload = {
            "Number": data.structured_data["invoice_number"],
            "Date": data.structured_data["date"],
            "Contractor": data.structured_data["supplier_name"],
            "Amount": data.structured_data["total_amount"],
            "Items": [
                {"Description": item["description"], "Quantity": item["quantity"],
                 "Price": item["unit_price"], "Amount": item["total"]}
                for item in data.structured_data["items"]
            ],
        }
        async with httpx.AsyncClient() as client:
            response = await client.post(
                f"{self.base_url}/odata/standard.odata/Document_Invoice",
                json=payload, auth=self.auth,
            )
            response.raise_for_status()

Bitrix24 Integration

class Bitrix24Exporter:
    def __init__(self, webhook_url: str):
        self.webhook_url = webhook_url

    async def create_deal(self, data: ExtractedData):
        async with httpx.AsyncClient() as client:
            await client.post(
                f"{self.webhook_url}/crm.deal.add.json",
                json={"fields": {
                    "TITLE": f"Invoice {data.structured_data['invoice_number']}",
                    "OPPORTUNITY": data.structured_data["total_amount"],
                    "CURRENCY_ID": data.structured_data["currency"],
                }},
            )

Multi-User Configuration

@dataclass
class UserSettings:
    user_id: str
    translate_to: str | None = None        # Target language code
    export_type: str = "google_sheets"      # google_sheets | 1c | bitrix | sap
    export_config: dict = field(default_factory=dict)  # Per-user export settings  # noqa: from dataclasses import field
    store_in_db: bool = True                # Store extracted data in DB
    auto_approve: bool = False              # Skip manual review
    custom_fields: list[str] | None = None  # Additional fields to extract

OCR Engine Selection

Engine Best For Speed Accuracy Setup
Claude Vision Any document, handwritten Fast Highest API key only
Tesseract Printed text, free/local Medium Good apt install tesseract-ocr
EasyOCR Multi-language, GPU-accelerated Medium Good pip install easyocr
PaddleOCR Complex layouts, tables Fast Very Good pip install paddleocr

Tesseract Integration

import pytesseract
from PIL import Image

def ocr_tesseract(image_path: str, lang: str = "eng+ukr") -> str:
    """OCR with Tesseract. Install: apt install tesseract-ocr tesseract-ocr-ukr"""
    image = Image.open(image_path)
    text = pytesseract.image_to_string(image, lang=lang, config="--psm 6")
    return text.strip()

def ocr_to_hocr(image_path: str) -> str:
    """Get positional data (bounding boxes) for each word."""
    return pytesseract.image_to_pdf_or_hocr(image_path, extension="hocr")

EasyOCR Integration

import easyocr

def ocr_easyocr(image_path: str, languages: list[str] = ["en", "uk"]) -> str:
    reader = easyocr.Reader(languages, gpu=True)
    results = reader.readtext(image_path)
    return "\n".join([text for _, text, conf in results if conf > 0.5])

PDF Processing

import fitz  # PyMuPDF
import pdfplumber

# PyMuPDF - fast text extraction
def extract_pdf_text(pdf_path: str) -> str:
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text()
    return text

# pdfplumber - table extraction
def extract_pdf_tables(pdf_path: str) -> list[list[list[str]]]:
    tables = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_tables = page.extract_tables()
            tables.extend(page_tables)
    return tables

# Combined: text + tables + images
async def process_pdf_complete(pdf_path: str, llm: LLMExtractor) -> ExtractedData:
    text = extract_pdf_text(pdf_path)
    tables = extract_pdf_tables(pdf_path)

    # If text extraction poor (scanned PDF), use OCR
    if len(text.strip()) < 50:
        doc = fitz.open(pdf_path)
        for page in doc:
            pix = page.get_pixmap(dpi=300)
            img_path = f"/tmp/page_{page.number}.png"
            pix.save(img_path)
            text += await llm.extract_with_vision(img_path)

    return await llm.extract_invoice(text + "\n\nTables:\n" + str(tables))

SAP Integration via RFC

# pip install pyrfc (requires SAP NW RFC SDK)
from pyrfc import Connection

class SAPExporter:
    def __init__(self, ashost: str, sysnr: str, client: str, user: str, passwd: str):
        self.conn_params = {
            "ashost": ashost, "sysnr": sysnr,
            "client": client, "user": user, "passwd": passwd,
        }

    def export_invoice(self, data: ExtractedData):
        with Connection(**self.conn_params) as conn:
            result = conn.call("BAPI_INCOMINGINVOICE_CREATE", {
                "HEADERDATA": {
                    "DOC_TYPE": "RE",
                    "COMP_CODE": "1000",
                    "DOC_DATE": data.structured_data["date"],
                    "REF_DOC_NO": data.structured_data["invoice_number"],
                    "GROSS_AMOUNT": data.structured_data["total_amount"],
                    "CURRENCY": data.structured_data["currency"],
                },
                "ITEMDATA": [
                    {"INVOICE_DOC_ITEM": str(i + 1).zfill(10),
                     "AMOUNT": item["total"],
                     "TAX_CODE": self._map_vat_code(item["vat_rate"])}
                    for i, item in enumerate(data.structured_data["items"])
                ],
            })
            if result.get("RETURN", [{}])[0].get("TYPE") == "E":
                raise Exception(f"SAP error: {result['RETURN'][0]['MESSAGE']}")
            return result

Validation Rules Engine

from dataclasses import dataclass

@dataclass
class ValidationRule:
    field: str
    rule_type: str   # required | format | range | cross_field
    params: dict

class DataValidator:
    RULES = {
        "invoice": [
            ValidationRule("invoice_number", "required", {}),
            ValidationRule("date", "format", {"pattern": r"\d{4}-\d{2}-\d{2}"}),
            ValidationRule("total_amount", "range", {"min": 0.01, "max": 99_999_999}),
            ValidationRule("total_amount", "cross_field", {
                "check": "equals_sum", "sum_field": "items.total",
            }),
            ValidationRule("vat_amount", "cross_field", {
                "check": "percentage_of", "base_field": "subtotal",
                "tolerance": 0.01,
            }),
        ],
    }

    def validate(self, data: ExtractedData) -> list[str]:
        errors = []
        rules = self.RULES.get(data.document_type.value, [])
        for rule in rules:
            value = data.structured_data.get(rule.field)
            if rule.rule_type == "required" and not value:
                errors.append(f"Missing required field: {rule.field}")
            elif rule.rule_type == "range":
                if value and not (rule.params["min"] <= value <= rule.params["max"]):
                    errors.append(f"{rule.field} out of range: {value}")
            elif rule.rule_type == "cross_field" and rule.params["check"] == "equals_sum":
                items_sum = sum(i.get("total", 0) for i in data.structured_data.get("items", []))
                if abs(value - items_sum) > 0.01:
                    errors.append(f"{rule.field} ({value}) != sum of items ({items_sum})")
        return errors

Admin Panel Features

  • User management (create, configure, deactivate)
  • Per-user export template configuration
  • Processing statistics and error logs
  • Manual review queue for low-confidence extractions
  • Custom field mapping editor
  • API key management for external integrations
  • Document processing history with search/filter
  • Batch processing mode (upload ZIP with multiple documents)
  • Webhook notifications on processing completion
Install via CLI
npx skills add https://github.com/skomax/skills --skill document-processing
Repository Details
star Stars 0
call_split Forks 0
navigation Branch main
article Path SKILL.md
More from Creator