# Source: autorename-pdf/_ai_processing.py (GitHub: ptmrio/autorename-pdf, branch main)
"""
AI content processing with multi-provider support via instructor.
Supports OpenAI, Anthropic (native), Gemini, xAI, and Ollama.
"""
from __future__ import annotations

import base64
import io
import logging

from pydantic import BaseModel, Field
from PIL import Image
import instructor
from openai import OpenAI

from _pdf_utils import ExtractionResult


# Base URL used for each provider that is reached through the OpenAI SDK.
# A value of ``None`` means no base URL override is handed to the client.
# "anthropic" is deliberately absent: get_instructor_client() serves it with
# the native Anthropic SDK instead.
PROVIDER_BASE_URLS = dict(
    openai=None,
    gemini="https://generativelanguage.googleapis.com/v1beta/openai/",
    xai="https://api.x.ai/v1",
    ollama="http://localhost:11434/v1",
)


class DocumentMetadata(BaseModel):
    """Structured output model for document metadata extraction."""
    # Used as the ``response_model`` for the LLM calls in this module. All
    # three fields are plain strings declared without a default; the system
    # prompt tells the model to leave a value empty when it cannot be found.
    # NOTE(review): pydantic is expected to expose the class docstring and the
    # ``Field`` descriptions in the generated JSON schema, so they probably
    # reach the model as prompt text -- confirm before rewording any of them.

    # Name of the other party on the document, without its legal form suffix.
    company_name: str = Field(
        description="Counterparty company name, stripped of legal form (GmbH, AG, Ltd, e.U., SARL, etc.)"
    )
    # Requested as dd.mm.YYYY text; nothing here parses or validates the date.
    document_date: str = Field(
        description="Most relevant date (invoice date, letter date) in dd.mm.YYYY format"
    )
    # Invoice code or a short free-text type. build_system_prompt() can
    # substitute configured labels for ER/AR in the prompt itself.
    document_type: str = Field(
        description="ER for incoming invoice, AR for outgoing invoice, or short descriptive type"
    )


def get_instructor_client(config: dict):
    """Return an instructor-wrapped LLM client for the configured provider.

    Reads ``provider``, ``api_key`` and ``base_url`` from ``config["ai"]``.
    Anthropic is served by its native SDK (their OpenAI compat ignores
    structured output); every other provider goes through the OpenAI SDK
    pointed at a compatible endpoint.

    Raises:
        ValueError: If the provider is not supported, or if no API key is
            configured for a provider other than Ollama.
    """
    ai_settings = config["ai"]
    provider = ai_settings["provider"]
    api_key = ai_settings.get("api_key", "")
    base_url_override = ai_settings.get("base_url", "")

    supported = [*PROVIDER_BASE_URLS, "anthropic"]
    if provider not in supported:
        raise ValueError(f"Unknown provider: {provider}. Supported: {', '.join(supported)}")

    is_ollama = provider == "ollama"
    if not is_ollama and not api_key:
        raise ValueError(f"API key required for provider '{provider}'. Set ai.api_key in config.yaml.")

    if provider == "anthropic":
        # Imported here so the anthropic package is only needed when selected.
        from anthropic import Anthropic
        return instructor.from_anthropic(Anthropic(api_key=api_key))

    # OpenAI-compatible path: an explicit base_url from config wins over the
    # provider's default endpoint.
    endpoint = base_url_override or PROVIDER_BASE_URLS.get(provider)
    if is_ollama and not api_key:
        # Placeholder key used when none is configured for Ollama.
        api_key = "ollama"

    openai_client = OpenAI(api_key=api_key, base_url=endpoint)
    # Ollama: JSON mode for broadest model compatibility (TOOLS requires
    # function calling support).
    if is_ollama:
        mode = instructor.Mode.JSON
    else:
        mode = instructor.Mode.TOOLS
    return instructor.from_openai(openai_client, mode=mode)


def _config_value(config: dict, section: str, key: str, default: str) -> str:
    """Return ``config[section][key]``, falling back to *default*.

    The fallback applies when the section is absent or empty (including
    ``None``, which is what an empty YAML key loads as) and when the value
    itself is absent or ``None``. Any other stored value, including an empty
    string, is returned unchanged.
    """
    section_values = config.get(section) or {}
    value = section_values.get(key)
    return default if value is None else value


def build_system_prompt(config: dict) -> str:
    """Build the extraction prompt from config values.

    Reads ``company.name``, ``output.language`` (default ``"English"``),
    ``pdf.incoming_invoice`` (default ``"ER"``), ``pdf.outgoing_invoice``
    (default ``"AR"``) and the top-level ``prompt_extension``. Sections or
    values that are missing or ``None`` fall back to their defaults instead
    of raising or leaking the text "None" into the prompt.

    Args:
        config: Application configuration mapping.

    Returns:
        The prompt text with surrounding whitespace stripped. When
        ``prompt_extension`` is non-empty it is appended after a blank line.
    """
    company = _config_value(config, "company", "name", "")
    lang = _config_value(config, "output", "language", "English")
    er = _config_value(config, "pdf", "incoming_invoice", "ER")
    ar = _config_value(config, "pdf", "outgoing_invoice", "AR")
    ext = config.get("prompt_extension", "")

    parts = [
        "You will extract the company name, document date, and document type "
        "from the following document content. "
        "Due to the nature of OCR text detection, the text may be noisy and contain "
        "spelling and detection errors. Handle those as well as possible.\n\n"
        "document_date: Find the most appropriate date (e.g. the invoice date) and "
        "assume the correct date format according to the language and location of the document. "
        "Return format must be: dd.mm.YYYY\n\n"
    ]

    if company:
        # Knowing the user's own company lets the model pick the counterparty.
        parts.append(
            f'company_name: Find the name of the company that is the corresponding party '
            f'of the document. My company name is: "{company}", avoid using my company name '
            f'as company_name in the response. For the company_name you always strip the '
            f'legal form (e.U., SARL, GmbH, AG, Ltd, Limited, etc.)\n\n'
        )
    else:
        parts.append(
            "company_name: Find the name of the main company in the document. "
            "Strip the legal form (e.U., SARL, GmbH, AG, Ltd, Limited, etc.)\n\n"
        )

    parts.append(
        f"document_type: Find the best matching type of the document. Valid document types are: "
        f"For incoming invoices (invoices my company receives) use the term '{er}' only, nothing more. "
        f"For outgoing invoices (invoices my company sends) use the term '{ar}', nothing more. "
        f"For all other document types, always find a short descriptive summary/subject in {lang} language.\n\n"
        "If a value is not found, leave it empty."
    )

    if ext:
        parts.append(f"\n\n{ext}")

    return "".join(parts).strip()


def pil_to_base64_data_uri(image: Image.Image, fmt: str = "PNG") -> str:
    """Serialize *image* and return it as a base64 ``data:`` URI.

    ``fmt`` is handed to ``image.save`` unchanged; its lower-cased form is
    used as the subtype of the ``image/...`` media type in the URI.
    """
    with io.BytesIO() as stream:
        image.save(stream, format=fmt)
        raw_bytes = stream.getvalue()
    encoded = base64.b64encode(raw_bytes).decode()
    return "data:image/" + fmt.lower() + ";base64," + encoded


def build_image_content(images: list, provider: str) -> list[dict]:
    """Wrap each image in the message content block the provider expects.

    For ``"anthropic"`` every image becomes an ``image`` block carrying raw
    base64 PNG data. For every other provider it becomes an ``image_url``
    block whose URL is a PNG data URI.
    """
    if provider != "anthropic":
        url_blocks = []
        for page in images:
            url_blocks.append(
                {"type": "image_url", "image_url": {"url": pil_to_base64_data_uri(page)}}
            )
        return url_blocks

    def _png_base64(page) -> str:
        # PNG-encode the page in memory and return the base64 text.
        stream = io.BytesIO()
        page.save(stream, format="PNG")
        return base64.b64encode(stream.getvalue()).decode()

    return [
        {
            "type": "image",
            "source": {
                "type": "base64",
                "media_type": "image/png",
                "data": _png_base64(page),
            },
        }
        for page in images
    ]


def extract_metadata_from_text(text: str, config: dict) -> DocumentMetadata:
    """Ask the configured LLM for document metadata based on plain text.

    The request uses ``model`` from ``config["ai"]`` together with
    ``max_retries`` (default 2) and ``temperature`` (default 0.0), and asks
    for a ``DocumentMetadata`` response.
    """
    llm = get_instructor_client(config)
    ai_settings = config["ai"]
    provider = ai_settings["provider"]

    request = dict(
        model=ai_settings["model"],
        response_model=DocumentMetadata,
        max_retries=ai_settings.get("max_retries", 2),
        temperature=ai_settings.get("temperature", 0.0),
    )
    system_message = {"role": "system", "content": build_system_prompt(config)}
    user_message = {
        "role": "user",
        "content": f"Extract the information from this text:\n\n{text}",
    }
    request["messages"] = [system_message, user_message]

    # Only the Anthropic request gets an explicit max_tokens; the other
    # providers are called without one.
    if provider == "anthropic":
        request["max_tokens"] = 1024

    return llm.chat.completions.create(**request)


def extract_metadata_from_images(images: list, config: dict) -> DocumentMetadata:
    """Ask a vision-capable LLM for document metadata based on page images.

    The images are converted to provider-specific content blocks and sent
    after a short text instruction in a single user message.
    """
    llm = get_instructor_client(config)
    ai_settings = config["ai"]
    provider = ai_settings["provider"]

    page_blocks = build_image_content(images, provider)

    request = dict(
        model=ai_settings["model"],
        response_model=DocumentMetadata,
        max_retries=ai_settings.get("max_retries", 2),
        temperature=ai_settings.get("temperature", 0.0),
    )
    system_message = {"role": "system", "content": build_system_prompt(config)}
    instruction = {"type": "text", "text": "Extract document metadata from these page images:"}
    user_message = {"role": "user", "content": [instruction] + list(page_blocks)}
    request["messages"] = [system_message, user_message]

    # Only the Anthropic request gets an explicit max_tokens.
    if provider == "anthropic":
        request["max_tokens"] = 1024

    return llm.chat.completions.create(**request)


def _build_combined_text(extraction: ExtractionResult) -> str:
    """Return the extracted text and the OCR text as one string for the AI.

    A source counts only if it is non-blank after stripping, but it is
    included unstripped. When both are present, an "--- OCR Text ---" marker
    line separates them. Returns "" when neither has content.
    """
    has_plain = bool(extraction.text.strip())
    has_ocr = bool(extraction.ocr_text.strip())

    if has_plain and has_ocr:
        pieces = [extraction.text, "\n--- OCR Text ---\n", extraction.ocr_text]
        return "\n".join(pieces)
    if has_plain:
        return extraction.text
    if has_ocr:
        return extraction.ocr_text
    return ""


def extract_metadata_from_text_and_images(
    text: str, images: list, config: dict
) -> DocumentMetadata:
    """Ask the LLM for metadata using both text and page images (multimodal).

    The text is embedded in the leading text block of the user message and
    the provider-specific image blocks follow it.
    """
    llm = get_instructor_client(config)
    ai_settings = config["ai"]
    provider = ai_settings["provider"]

    page_blocks = build_image_content(images, provider)

    request = dict(
        model=ai_settings["model"],
        response_model=DocumentMetadata,
        max_retries=ai_settings.get("max_retries", 2),
        temperature=ai_settings.get("temperature", 0.0),
    )
    system_message = {"role": "system", "content": build_system_prompt(config)}
    instruction = {
        "type": "text",
        "text": f"Extract document metadata from this text and images:\n\n{text}",
    }
    user_message = {"role": "user", "content": [instruction] + list(page_blocks)}
    request["messages"] = [system_message, user_message]

    # Only the Anthropic request gets an explicit max_tokens.
    if provider == "anthropic":
        request["max_tokens"] = 1024

    return llm.chat.completions.create(**request)


def extract_metadata(extraction: ExtractionResult, config: dict) -> DocumentMetadata | None:
    """Pick the extraction strategy that matches what *extraction* contains.

    Text plus images uses the multimodal request, images alone the vision
    request, and text alone the text request. With neither, an error is
    logged and ``None`` is returned.
    """
    combined_text = _build_combined_text(extraction)
    text_available = bool(combined_text.strip())
    images_available = bool(extraction.images)

    if not (text_available or images_available):
        logging.error("No text or images available for metadata extraction")
        return None

    if not text_available:
        return extract_metadata_from_images(extraction.images, config)
    if not images_available:
        return extract_metadata_from_text(combined_text, config)
    return extract_metadata_from_text_and_images(combined_text, extraction.images, config)