Compare commits
No commits in common. "599021817204225d0babbeed7cc97688107ffa85" and "ebdba99d9f4fb01cfa3f6aca6192a294abdccff1" have entirely different histories.
5990218172
...
ebdba99d9f
3 changed files with 0 additions and 298 deletions
|
|
@ -1,27 +0,0 @@
|
||||||
"""Pydantic v2 data contracts shared by the pipeline, adapters, and store.
|
|
||||||
|
|
||||||
Re-exports the public symbols from sibling modules so call sites can write
|
|
||||||
``from ix.contracts import RequestIX`` without chasing the submodule layout.
|
|
||||||
"""
|
|
||||||
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
from ix.contracts.request import (
|
|
||||||
Context,
|
|
||||||
FileRef,
|
|
||||||
GenAIOptions,
|
|
||||||
OCROptions,
|
|
||||||
Options,
|
|
||||||
ProvenanceOptions,
|
|
||||||
RequestIX,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Names re-exported from ``ix.contracts.request``; this list is the package's
# declared public surface and mirrors the import statement above.
__all__ = [
    "Context",
    "FileRef",
    "GenAIOptions",
    "OCROptions",
    "Options",
    "ProvenanceOptions",
    "RequestIX",
]
|
|
||||||
|
|
@ -1,103 +0,0 @@
|
||||||
"""Incoming request contracts — :class:`RequestIX` + nested option blocks.
|
|
||||||
|
|
||||||
Mirrors MVP spec §3 exactly. Dropped spec fields (e.g. ``use_vision``,
|
|
||||||
``reasoning_effort``, ``version``) are intentionally absent from this module;
|
|
||||||
if a caller sends one, ``RequestIX`` rejects the payload (``extra="forbid"``).
|
|
||||||
|
|
||||||
The file-URL union keeps plain strings as ``str`` and dict entries as
|
|
||||||
:class:`FileRef` so callers can mix auth-required URLs with plain ones in a
|
|
||||||
single list without wrapping every entry.
|
|
||||||
"""
|
|
||||||
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
from typing import Literal
|
|
||||||
|
|
||||||
from pydantic import BaseModel, ConfigDict, Field
|
|
||||||
|
|
||||||
|
|
||||||
class FileRef(BaseModel):
    """One file to fetch, given as a URL plus optional per-file settings.

    Choose this form over a bare ``str`` entry in :attr:`Context.files` when
    the URL needs authentication headers (e.g. a Paperless ``Token``) or a
    size cap tighter than :envvar:`IX_FILE_MAX_BYTES`. URLs that need neither
    can remain plain strings.
    """

    # Unknown keys are a validation error instead of being silently ignored.
    model_config = ConfigDict(extra="forbid")

    # Location of the file.
    url: str
    # Optional auth headers; every instance gets its own empty dict by default.
    headers: dict[str, str] = Field(default_factory=dict)
    # Per-file size override; ``None`` means no override was supplied.
    max_bytes: int | None = None
|
|
||||||
|
|
||||||
|
|
||||||
class Context(BaseModel):
    """Input documents for a job: files to fetch and OCR, plus ready texts."""

    # Unknown keys are a validation error instead of being silently ignored.
    model_config = ConfigDict(extra="forbid")

    # Each entry is either a bare URL string or a :class:`FileRef`; both kinds
    # may be mixed in one list. Defaults to a fresh empty list.
    files: list[str | FileRef] = Field(default_factory=list)
    # Pre-extracted texts supplied alongside (or instead of) files.
    texts: list[str] = Field(default_factory=list)
|
|
||||||
|
|
||||||
|
|
||||||
class OCROptions(BaseModel):
    """Settings for the OCR step.

    ``service`` only admits ``"surya"`` today. It stays in the contract so the
    adapter seam remains visible even though the MVP wires a single engine;
    future engines plug in behind the same field.
    """

    # Unknown keys are a validation error instead of being silently ignored.
    model_config = ConfigDict(extra="forbid")

    # Boolean switches; only ``use_ocr`` is on by default.
    use_ocr: bool = True
    ocr_only: bool = False
    include_ocr_text: bool = False
    include_geometries: bool = False
    # OCR engine selector; the Literal restricts it to the one supported value.
    service: Literal["surya"] = "surya"
|
|
||||||
|
|
||||||
|
|
||||||
class GenAIOptions(BaseModel):
    """Settings for the LLM step."""

    # Unknown keys are a validation error instead of being silently ignored.
    model_config = ConfigDict(extra="forbid")

    # Name of the model to use; ``None`` when the caller does not specify one.
    gen_ai_model_name: str | None = None
|
|
||||||
|
|
||||||
|
|
||||||
class ProvenanceOptions(BaseModel):
    """Settings for the provenance and reliability steps."""

    # Unknown keys are a validation error instead of being silently ignored.
    model_config = ConfigDict(extra="forbid")

    # Provenance is included unless the caller switches it off.
    include_provenance: bool = True
    # Cap on the number of sources per field; defaults to 10.
    max_sources_per_field: int = 10
|
|
||||||
|
|
||||||
|
|
||||||
class Options(BaseModel):
    """Container grouping the per-step option blocks of a :class:`RequestIX`."""

    # Unknown keys are a validation error instead of being silently ignored.
    model_config = ConfigDict(extra="forbid")

    # Each block is built from its own defaults when the caller omits it, so
    # a bare ``Options()`` is always fully populated.
    ocr: OCROptions = Field(default_factory=OCROptions)
    gen_ai: GenAIOptions = Field(default_factory=GenAIOptions)
    provenance: ProvenanceOptions = Field(default_factory=ProvenanceOptions)
|
|
||||||
|
|
||||||
|
|
||||||
class RequestIX(BaseModel):
    """Top-level job request envelope.

    ``ix_id`` is the transport-assigned 16-char hex handle. Callers MUST NOT
    set it; the REST adapter / pg-queue adapter populates it on insert. The
    field is declared here anyway so that a stored job round-trips through
    this model unchanged (e.g. when the worker re-hydrates a job out of the
    store).
    """

    # Unknown top-level keys are a validation error, not silently ignored.
    model_config = ConfigDict(extra="forbid")

    # Required identification fields.
    use_case: str
    ix_client_id: str
    request_id: str
    # Filled in by the transport layer; ``None`` on a fresh caller payload.
    ix_id: str | None = None
    # Required document payload.
    context: Context
    # Falls back to an all-defaults ``Options`` when omitted.
    options: Options = Field(default_factory=Options)
    # Optional callback URL; ``None`` when the caller gives none.
    callback_url: str | None = None
|
|
||||||
|
|
@ -1,168 +0,0 @@
|
||||||
"""Pydantic contracts — RequestIX and its nested option structures (spec §3)."""
|
|
||||||
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import json
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
from pydantic import ValidationError
|
|
||||||
|
|
||||||
from ix.contracts import (
|
|
||||||
Context,
|
|
||||||
FileRef,
|
|
||||||
GenAIOptions,
|
|
||||||
OCROptions,
|
|
||||||
Options,
|
|
||||||
ProvenanceOptions,
|
|
||||||
RequestIX,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class TestFileRef:
    """Construction behaviour of :class:`FileRef`."""

    def test_minimal(self) -> None:
        """With only ``url`` given, headers are empty and no cap is set."""
        url = "https://example.com/x.pdf"
        ref = FileRef(url=url)

        assert ref.url == "https://example.com/x.pdf"
        assert ref.headers == {}
        assert ref.max_bytes is None

    def test_with_headers_and_max_bytes(self) -> None:
        """Explicit headers and a size cap are stored as given."""
        kwargs = {
            "url": "https://paperless/x.pdf",
            "headers": {"Authorization": "Token abc"},
            "max_bytes": 1_000_000,
        }
        ref = FileRef(**kwargs)

        assert ref.headers == {"Authorization": "Token abc"}
        assert ref.max_bytes == 1_000_000
|
|
||||||
|
|
||||||
|
|
||||||
class TestOptionDefaults:
    """Default values of each option block, as pinned by the spec."""

    def test_ocr_defaults_match_spec(self) -> None:
        """OCR is on by default; every other switch is off; engine is surya."""
        ocr = OCROptions()

        assert ocr.use_ocr is True
        assert ocr.ocr_only is False
        assert ocr.include_ocr_text is False
        assert ocr.include_geometries is False
        assert ocr.service == "surya"

    def test_genai_defaults_match_spec(self) -> None:
        """No model name is preselected."""
        gen_ai = GenAIOptions()

        assert gen_ai.gen_ai_model_name is None

    def test_provenance_defaults_match_spec(self) -> None:
        """Provenance is on, capped at ten sources per field."""
        provenance = ProvenanceOptions()

        assert provenance.include_provenance is True
        assert provenance.max_sources_per_field == 10

    def test_options_default_nests_each_block(self) -> None:
        """A bare ``Options()`` instantiates every nested block."""
        options = Options()

        assert isinstance(options.ocr, OCROptions)
        assert isinstance(options.gen_ai, GenAIOptions)
        assert isinstance(options.provenance, ProvenanceOptions)
|
|
||||||
|
|
||||||
|
|
||||||
class TestContextFiles:
    """How :attr:`Context.files` handles its ``str | FileRef`` union."""

    def test_plain_string_entry_preserved_as_str(self) -> None:
        """A bare URL string is kept as ``str`` and not wrapped."""
        context = Context(files=["https://example.com/a.pdf"])

        assert context.files == ["https://example.com/a.pdf"]
        first = context.files[0]
        assert isinstance(first, str)

    def test_dict_entry_parsed_as_fileref(self) -> None:
        """A dict entry is validated into a :class:`FileRef`."""
        raw_entry = {"url": "https://x/a.pdf", "headers": {"H": "v"}}
        context = Context(files=[raw_entry])

        assert len(context.files) == 1
        parsed = context.files[0]
        assert isinstance(parsed, FileRef)
        assert parsed.url == "https://x/a.pdf"
        assert parsed.headers == {"H": "v"}

    def test_mixed_entries(self) -> None:
        """Strings and dicts can share one list; texts pass through."""
        plain_url = "file:///tmp/x.pdf"
        with_auth = {
            "url": "https://paperless/y.pdf",
            "headers": {"Authorization": "Token t"},
        }
        context = Context(files=[plain_url, with_auth], texts=["extra ocr text"])

        assert isinstance(context.files[0], str)
        assert isinstance(context.files[1], FileRef)
        assert context.texts == ["extra ocr text"]

    def test_empty_defaults(self) -> None:
        """Both lists default to empty."""
        context = Context()

        assert context.files == []
        assert context.texts == []
|
|
||||||
|
|
||||||
|
|
||||||
class TestRequestIX:
    """Validation, defaults and JSON round-tripping of :class:`RequestIX`."""

    def _minimal_payload(self) -> dict:
        """Return a fresh payload holding only the required fields."""
        return {
            "use_case": "bank_statement_header",
            "ix_client_id": "mammon",
            "request_id": "req-1",
            "context": {"files": ["https://example/x.pdf"]},
        }

    def test_minimal_valid(self) -> None:
        """Required fields alone validate; optional fields take defaults."""
        r = RequestIX(**self._minimal_payload())
        assert r.use_case == "bank_statement_header"
        assert r.ix_id is None
        assert r.callback_url is None
        assert r.options.provenance.include_provenance is True

    def test_roundtrip_json_mixed_files(self) -> None:
        """A fully populated request survives dump-to-JSON and re-validation."""
        payload = {
            "use_case": "bank_statement_header",
            "ix_client_id": "mammon",
            "request_id": "req-42",
            "context": {
                "files": [
                    "file:///tmp/x.pdf",
                    {
                        "url": "https://paperless/y.pdf",
                        "headers": {"Authorization": "Token t"},
                        "max_bytes": 2_000_000,
                    },
                ],
                "texts": ["paperless ocr text"],
            },
            "options": {
                "ocr": {"include_ocr_text": True},
                "gen_ai": {"gen_ai_model_name": "gpt-oss:20b"},
                "provenance": {"max_sources_per_field": 5},
            },
            "callback_url": "https://mammon/ix-callback",
        }
        r = RequestIX.model_validate(payload)
        assert isinstance(r.context.files[0], str)
        assert isinstance(r.context.files[1], FileRef)
        assert r.context.files[1].headers == {"Authorization": "Token t"}
        assert r.options.ocr.include_ocr_text is True
        assert r.options.gen_ai.gen_ai_model_name == "gpt-oss:20b"
        assert r.options.provenance.max_sources_per_field == 5
        assert r.callback_url == "https://mammon/ix-callback"

        # Round-trip through JSON and back: FileRef dicts survive as FileRef.
        dumped = r.model_dump_json()
        r2 = RequestIX.model_validate_json(dumped)
        assert isinstance(r2.context.files[1], FileRef)
        assert r2.context.files[1].headers == {"Authorization": "Token t"}

        # The dumped string must parse as JSON; json.loads raises otherwise.
        json.loads(dumped)

    def test_unknown_fields_rejected(self) -> None:
        """An unexpected top-level key fails validation (``extra="forbid"``)."""
        payload = self._minimal_payload()
        payload["not_a_field"] = "x"
        with pytest.raises(ValidationError):
            RequestIX.model_validate(payload)

    def test_ix_id_optional_and_documented(self) -> None:
        """``ix_id`` defaults to ``None`` and the docstring states who sets it."""
        # The docstring contract: caller MUST NOT set; transport assigns.
        # Here we only assert the field exists and defaults to None — the
        # "MUST NOT set" is a convention enforced at the transport layer.
        r = RequestIX(**self._minimal_payload())
        assert r.ix_id is None
        # ``__doc__`` is ``str | None`` (None when docstrings are stripped),
        # so normalise once instead of calling ``.lower()`` on a possible None.
        doc = RequestIX.__doc__ or ""
        assert "transport" in doc.lower() or "MUST NOT" in doc

    def test_missing_required_fields(self) -> None:
        """A payload lacking required fields fails validation."""
        with pytest.raises(ValidationError):
            RequestIX.model_validate({"use_case": "x"})
|
|
||||||
Loading…
Reference in a new issue