infoxtractor/tests/unit/test_contracts.py
Dirk Riemann 181cc0fbea
All checks were successful
tests / test (push) Successful in 1m2s
tests / test (pull_request) Successful in 1m6s
feat(contracts): RequestIX + Context + Options per spec §3
Adds the incoming-request data contracts as Pydantic v2 models. Matches the
MVP spec §3 exactly — fields dropped from the reference spec (use_vision,
reasoning_effort, version, ...) stay out, and `extra="forbid"` catches any
caller that sends them, so drift surfaces immediately instead of passing silently.

Context.files is `list[str | FileRef]`: plain URLs stay str, dict entries
parse as FileRef. This keeps the common case (public URL) one-liner while
still supporting Paperless-style auth headers and per-file size caps.

ix_id stays optional with a docstring warning that callers MUST NOT set it —
the transport layer assigns the 16-char hex handle on insert. The field is
present so `Job` round-trips out of the store.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-18 10:47:31 +02:00

168 lines
5.8 KiB
Python

"""Pydantic contracts — RequestIX and its nested option structures (spec §3)."""
from __future__ import annotations
import json
import pytest
from pydantic import ValidationError
from ix.contracts import (
Context,
FileRef,
GenAIOptions,
OCROptions,
Options,
ProvenanceOptions,
RequestIX,
)
class TestFileRef:
    """FileRef construction with and without its optional fields."""

    def test_minimal(self) -> None:
        """A URL-only FileRef has empty headers and no max_bytes."""
        target = "https://example.com/x.pdf"
        ref = FileRef(url=target)

        assert ref.url == target
        assert ref.headers == {}
        assert ref.max_bytes is None

    def test_with_headers_and_max_bytes(self) -> None:
        """Explicit headers and max_bytes are stored as given."""
        expected_headers = {"Authorization": "Token abc"}
        expected_limit = 1_000_000

        # Pass a copy so the comparison below is against an independent dict.
        ref = FileRef(
            url="https://paperless/x.pdf",
            headers=dict(expected_headers),
            max_bytes=expected_limit,
        )

        assert ref.headers == expected_headers
        assert ref.max_bytes == expected_limit
class TestOptionDefaults:
    """Default values of each option block and of the Options container."""

    def test_ocr_defaults_match_spec(self) -> None:
        """OCROptions() enables OCR, leaves the other flags off, uses surya."""
        ocr = OCROptions()

        assert ocr.use_ocr is True
        # The remaining boolean switches all default to False.
        for flag in ("ocr_only", "include_ocr_text", "include_geometries"):
            assert getattr(ocr, flag) is False, flag
        assert ocr.service == "surya"

    def test_genai_defaults_match_spec(self) -> None:
        """GenAIOptions() leaves the model name unset."""
        gen_ai = GenAIOptions()
        assert gen_ai.gen_ai_model_name is None

    def test_provenance_defaults_match_spec(self) -> None:
        """ProvenanceOptions() enables provenance with 10 sources per field."""
        provenance = ProvenanceOptions()
        assert provenance.include_provenance is True
        assert provenance.max_sources_per_field == 10

    def test_options_default_nests_each_block(self) -> None:
        """Options() populates every nested block with its own model type."""
        container = Options()
        expected_blocks = (
            ("ocr", OCROptions),
            ("gen_ai", GenAIOptions),
            ("provenance", ProvenanceOptions),
        )
        for attr, block_type in expected_blocks:
            assert isinstance(getattr(container, attr), block_type), attr
class TestContextFiles:
    """Context.files accepts plain strings and FileRef-shaped dicts."""

    def test_plain_string_entry_preserved_as_str(self) -> None:
        """A string entry stays a str after validation."""
        url = "https://example.com/a.pdf"
        context = Context(files=[url])

        assert context.files == [url]
        assert isinstance(context.files[0], str)

    def test_dict_entry_parsed_as_fileref(self) -> None:
        """A dict entry is parsed into a FileRef carrying its url and headers."""
        raw_entry = {"url": "https://x/a.pdf", "headers": {"H": "v"}}
        context = Context(files=[raw_entry])

        assert len(context.files) == 1
        parsed = context.files[0]
        assert isinstance(parsed, FileRef)
        assert parsed.url == "https://x/a.pdf"
        assert parsed.headers == {"H": "v"}

    def test_mixed_entries(self) -> None:
        """String and dict entries can be mixed in one list, alongside texts."""
        plain_entry = "file:///tmp/x.pdf"
        auth_entry = {
            "url": "https://paperless/y.pdf",
            "headers": {"Authorization": "Token t"},
        }
        context = Context(
            files=[plain_entry, auth_entry],
            texts=["extra ocr text"],
        )

        assert isinstance(context.files[0], str)
        assert isinstance(context.files[1], FileRef)
        assert context.texts == ["extra ocr text"]

    def test_empty_defaults(self) -> None:
        """Context() defaults both files and texts to empty lists."""
        empty = Context()
        assert empty.files == []
        assert empty.texts == []
class TestRequestIX:
    """RequestIX validation, defaults, JSON round-trip and strictness."""

    def _minimal_payload(self) -> dict:
        """Return a fresh payload holding the fields the tests treat as minimal."""
        return {
            "use_case": "bank_statement_header",
            "ix_client_id": "mammon",
            "request_id": "req-1",
            "context": {"files": ["https://example/x.pdf"]},
        }

    def test_minimal_valid(self) -> None:
        """The minimal payload validates and the optional fields take defaults."""
        r = RequestIX(**self._minimal_payload())
        assert r.use_case == "bank_statement_header"
        assert r.ix_id is None
        assert r.callback_url is None
        assert r.options.provenance.include_provenance is True

    def test_roundtrip_json_mixed_files(self) -> None:
        """A full payload validates and survives a dump/validate JSON cycle."""
        payload = {
            "use_case": "bank_statement_header",
            "ix_client_id": "mammon",
            "request_id": "req-42",
            "context": {
                "files": [
                    "file:///tmp/x.pdf",
                    {
                        "url": "https://paperless/y.pdf",
                        "headers": {"Authorization": "Token t"},
                        "max_bytes": 2_000_000,
                    },
                ],
                "texts": ["paperless ocr text"],
            },
            "options": {
                "ocr": {"include_ocr_text": True},
                "gen_ai": {"gen_ai_model_name": "gpt-oss:20b"},
                "provenance": {"max_sources_per_field": 5},
            },
            "callback_url": "https://mammon/ix-callback",
        }
        r = RequestIX.model_validate(payload)
        assert isinstance(r.context.files[0], str)
        assert isinstance(r.context.files[1], FileRef)
        assert r.context.files[1].headers == {"Authorization": "Token t"}
        assert r.options.ocr.include_ocr_text is True
        assert r.options.gen_ai.gen_ai_model_name == "gpt-oss:20b"
        assert r.options.provenance.max_sources_per_field == 5
        assert r.callback_url == "https://mammon/ix-callback"

        # Round-trip through JSON and back: FileRef dicts survive as FileRef.
        dumped = r.model_dump_json()
        r2 = RequestIX.model_validate_json(dumped)
        assert isinstance(r2.context.files[1], FileRef)
        assert r2.context.files[1].headers == {"Authorization": "Token t"}

        # json.loads raises if the dump is not valid JSON.
        json.loads(dumped)

    def test_unknown_fields_rejected(self) -> None:
        """An unknown top-level key makes validation raise ValidationError."""
        payload = self._minimal_payload()
        payload["not_a_field"] = "x"
        with pytest.raises(ValidationError):
            RequestIX.model_validate(payload)

    def test_ix_id_optional_and_documented(self) -> None:
        """ix_id defaults to None and the class docstring mentions the contract."""
        # The docstring contract: caller MUST NOT set; transport assigns.
        # Here we only assert the field exists and defaults to None — the
        # "MUST NOT set" is a convention enforced at the transport layer.
        r = RequestIX(**self._minimal_payload())
        assert r.ix_id is None

        # __doc__ is None when a class has no docstring; normalise it once so
        # a missing docstring fails this assertion instead of raising
        # AttributeError on None.lower().
        doc = RequestIX.__doc__ or ""
        assert "transport" in doc.lower() or "MUST NOT" in doc

    def test_missing_required_fields(self) -> None:
        """A payload holding only use_case is rejected with ValidationError."""
        with pytest.raises(ValidationError):
            RequestIX.model_validate({"use_case": "x"})