diff --git a/src/ix/contracts/__init__.py b/src/ix/contracts/__init__.py
new file mode 100644
index 0000000..2f57a45
--- /dev/null
+++ b/src/ix/contracts/__init__.py
@@ -0,0 +1,27 @@
+"""Pydantic v2 data contracts shared by the pipeline, adapters, and store.
+
+Re-exports the public symbols from sibling modules so call sites can write
+``from ix.contracts import RequestIX`` without chasing the submodule layout.
+"""
+
+from __future__ import annotations
+
+from ix.contracts.request import (
+    Context,
+    FileRef,
+    GenAIOptions,
+    OCROptions,
+    Options,
+    ProvenanceOptions,
+    RequestIX,
+)
+
+__all__ = [
+    "Context",
+    "FileRef",
+    "GenAIOptions",
+    "OCROptions",
+    "Options",
+    "ProvenanceOptions",
+    "RequestIX",
+]
diff --git a/src/ix/contracts/request.py b/src/ix/contracts/request.py
new file mode 100644
index 0000000..58e3d3c
--- /dev/null
+++ b/src/ix/contracts/request.py
@@ -0,0 +1,103 @@
+"""Incoming request contracts — :class:`RequestIX` + nested option blocks.
+
+Mirrors MVP spec §3 exactly. Dropped spec fields (e.g. ``use_vision``,
+``reasoning_effort``, ``version``) are intentionally absent from this module;
+if a caller sends one, ``RequestIX`` rejects the payload (``extra="forbid"``).
+
+The file-URL union keeps plain strings as ``str`` and dict entries as
+:class:`FileRef` so callers can mix auth-required URLs with plain ones in a
+single list without wrapping every entry.
+"""
+
+from __future__ import annotations
+
+from typing import Literal
+
+from pydantic import BaseModel, ConfigDict, Field
+
+
+class FileRef(BaseModel):
+    """A file entry with optional auth headers and per-file size override.
+
+    Used when the file URL needs authentication (e.g. Paperless ``Token``) or a
+    tighter size cap than :envvar:`IX_FILE_MAX_BYTES`. Plain URLs that need no
+    headers can stay as bare ``str`` values in :attr:`Context.files`.
+    """
+
+    model_config = ConfigDict(extra="forbid")
+
+    url: str
+    headers: dict[str, str] = Field(default_factory=dict)
+    max_bytes: int | None = None
+
+
+class Context(BaseModel):
+    """Document payload: files to fetch + OCR + any pre-extracted texts."""
+
+    model_config = ConfigDict(extra="forbid")
+
+    files: list[str | FileRef] = Field(default_factory=list)
+    texts: list[str] = Field(default_factory=list)
+
+
+class OCROptions(BaseModel):
+    """Knobs for the OCR step.
+
+    ``service`` is kept so the adapter point stays visible in the contract even
+    though MVP only wires Surya. Future engines plug in behind the same name.
+    """
+
+    model_config = ConfigDict(extra="forbid")
+
+    use_ocr: bool = True
+    ocr_only: bool = False
+    include_ocr_text: bool = False
+    include_geometries: bool = False
+    service: Literal["surya"] = "surya"
+
+
+class GenAIOptions(BaseModel):
+    """Knobs for the LLM step."""
+
+    model_config = ConfigDict(extra="forbid")
+
+    gen_ai_model_name: str | None = None
+
+
+class ProvenanceOptions(BaseModel):
+    """Knobs for the provenance + reliability steps."""
+
+    model_config = ConfigDict(extra="forbid")
+
+    include_provenance: bool = True
+    max_sources_per_field: int = 10
+
+
+class Options(BaseModel):
+    """Aggregate options block nested into :class:`RequestIX`."""
+
+    model_config = ConfigDict(extra="forbid")
+
+    ocr: OCROptions = Field(default_factory=OCROptions)
+    gen_ai: GenAIOptions = Field(default_factory=GenAIOptions)
+    provenance: ProvenanceOptions = Field(default_factory=ProvenanceOptions)
+
+
+class RequestIX(BaseModel):
+    """Top-level job request.
+
+    ``ix_id`` is a transport-assigned 16-char hex handle. Callers MUST NOT set
+    it; the REST adapter / pg-queue adapter populates it on insert. The field
+    is kept here so a job that already carries ``ix_id`` still validates on a
+    round-trip (e.g. when the worker re-hydrates a job out of the store).
+    """
+
+    model_config = ConfigDict(extra="forbid")
+
+    use_case: str
+    ix_client_id: str
+    request_id: str
+    ix_id: str | None = None
+    context: Context
+    options: Options = Field(default_factory=Options)
+    callback_url: str | None = None
diff --git a/tests/unit/test_contracts.py b/tests/unit/test_contracts.py
new file mode 100644
index 0000000..d1b94e0
--- /dev/null
+++ b/tests/unit/test_contracts.py
@@ -0,0 +1,172 @@
+"""Pydantic contracts — RequestIX and its nested option structures (spec §3)."""
+
+from __future__ import annotations
+
+import json
+
+import pytest
+from pydantic import ValidationError
+
+from ix.contracts import (
+    Context,
+    FileRef,
+    GenAIOptions,
+    OCROptions,
+    Options,
+    ProvenanceOptions,
+    RequestIX,
+)
+
+
+class TestFileRef:
+    def test_minimal(self) -> None:
+        fr = FileRef(url="https://example.com/x.pdf")
+        assert fr.url == "https://example.com/x.pdf"
+        assert fr.headers == {}
+        assert fr.max_bytes is None
+
+    def test_with_headers_and_max_bytes(self) -> None:
+        fr = FileRef(
+            url="https://paperless/x.pdf",
+            headers={"Authorization": "Token abc"},
+            max_bytes=1_000_000,
+        )
+        assert fr.headers == {"Authorization": "Token abc"}
+        assert fr.max_bytes == 1_000_000
+
+
+class TestOptionDefaults:
+    def test_ocr_defaults_match_spec(self) -> None:
+        o = OCROptions()
+        assert o.use_ocr is True
+        assert o.ocr_only is False
+        assert o.include_ocr_text is False
+        assert o.include_geometries is False
+        assert o.service == "surya"
+
+    def test_genai_defaults_match_spec(self) -> None:
+        g = GenAIOptions()
+        assert g.gen_ai_model_name is None
+
+    def test_provenance_defaults_match_spec(self) -> None:
+        p = ProvenanceOptions()
+        assert p.include_provenance is True
+        assert p.max_sources_per_field == 10
+
+    def test_options_default_nests_each_block(self) -> None:
+        opts = Options()
+        assert isinstance(opts.ocr, OCROptions)
+        assert isinstance(opts.gen_ai, GenAIOptions)
+        assert isinstance(opts.provenance, ProvenanceOptions)
+
+
+class TestContextFiles:
+    def test_plain_string_entry_preserved_as_str(self) -> None:
+        ctx = Context(files=["https://example.com/a.pdf"])
+        assert ctx.files == ["https://example.com/a.pdf"]
+        assert isinstance(ctx.files[0], str)
+
+    def test_dict_entry_parsed_as_fileref(self) -> None:
+        ctx = Context(files=[{"url": "https://x/a.pdf", "headers": {"H": "v"}}])
+        assert len(ctx.files) == 1
+        entry = ctx.files[0]
+        assert isinstance(entry, FileRef)
+        assert entry.url == "https://x/a.pdf"
+        assert entry.headers == {"H": "v"}
+
+    def test_mixed_entries(self) -> None:
+        ctx = Context(
+            files=[
+                "file:///tmp/x.pdf",
+                {"url": "https://paperless/y.pdf", "headers": {"Authorization": "Token t"}},
+            ],
+            texts=["extra ocr text"],
+        )
+        assert isinstance(ctx.files[0], str)
+        assert isinstance(ctx.files[1], FileRef)
+        assert ctx.texts == ["extra ocr text"]
+
+    def test_empty_defaults(self) -> None:
+        ctx = Context()
+        assert ctx.files == []
+        assert ctx.texts == []
+
+
+class TestRequestIX:
+    def _minimal_payload(self) -> dict:
+        """Return a payload holding only the fields RequestIX has no default for."""
+        return {
+            "use_case": "bank_statement_header",
+            "ix_client_id": "mammon",
+            "request_id": "req-1",
+            "context": {"files": ["https://example/x.pdf"]},
+        }
+
+    def test_minimal_valid(self) -> None:
+        r = RequestIX(**self._minimal_payload())
+        assert r.use_case == "bank_statement_header"
+        assert r.ix_id is None
+        assert r.callback_url is None
+        assert r.options.provenance.include_provenance is True
+
+    def test_roundtrip_json_mixed_files(self) -> None:
+        payload = {
+            "use_case": "bank_statement_header",
+            "ix_client_id": "mammon",
+            "request_id": "req-42",
+            "context": {
+                "files": [
+                    "file:///tmp/x.pdf",
+                    {
+                        "url": "https://paperless/y.pdf",
+                        "headers": {"Authorization": "Token t"},
+                        "max_bytes": 2_000_000,
+                    },
+                ],
+                "texts": ["paperless ocr text"],
+            },
+            "options": {
+                "ocr": {"include_ocr_text": True},
+                "gen_ai": {"gen_ai_model_name": "gpt-oss:20b"},
+                "provenance": {"max_sources_per_field": 5},
+            },
+            "callback_url": "https://mammon/ix-callback",
+        }
+        r = RequestIX.model_validate(payload)
+        assert isinstance(r.context.files[0], str)
+        assert isinstance(r.context.files[1], FileRef)
+        assert r.context.files[1].headers == {"Authorization": "Token t"}
+        assert r.options.ocr.include_ocr_text is True
+        assert r.options.gen_ai.gen_ai_model_name == "gpt-oss:20b"
+        assert r.options.provenance.max_sources_per_field == 5
+        assert r.callback_url == "https://mammon/ix-callback"
+
+        # Round-trip through JSON and back: FileRef dicts survive as FileRef.
+        dumped = r.model_dump_json()
+        r2 = RequestIX.model_validate_json(dumped)
+        assert isinstance(r2.context.files[1], FileRef)
+        assert r2.context.files[1].headers == {"Authorization": "Token t"}
+
+        # dumped JSON is valid JSON
+        json.loads(dumped)
+
+    def test_unknown_fields_rejected(self) -> None:
+        payload = self._minimal_payload()
+        payload["not_a_field"] = "x"
+        with pytest.raises(ValidationError):
+            RequestIX.model_validate(payload)
+
+    def test_ix_id_optional_and_documented(self) -> None:
+        # The docstring contract: caller MUST NOT set; transport assigns.
+        # Here we only assert the field exists and defaults to None — the
+        # "MUST NOT set" is a convention enforced at the transport layer.
+        r = RequestIX(**self._minimal_payload())
+        assert r.ix_id is None
+        # ``__doc__`` is ``str | None`` (it is None under ``-OO``); normalise it
+        # once so both membership checks are guarded the same way.
+        doc = RequestIX.__doc__ or ""
+        assert "transport" in doc.lower() or "MUST NOT" in doc
+
+    def test_missing_required_fields(self) -> None:
+        with pytest.raises(ValidationError):
+            RequestIX.model_validate({"use_case": "x"})