Lands the two remaining provenance-subsystem pieces:
mapper.py — map_segment_refs_to_provenance:
- For each LLM SegmentCitation, pick segment IDs per source_type
  (`value` vs `value_and_context`), cap at max_sources_per_field,
  resolve each via SegmentIndex, and track invalid references.
- Resolve field values by dot-path (`result.items[0].name` is supported:
  `[N]` bracket notation is normalised to `.N` before traversal).
- Skip fields that resolve to zero valid sources (spec §9.4).
- Write quality_metrics with fields_with_provenance / total_fields /
  coverage_rate / invalid_references.
verify.py — verify_field + apply_reliability_flags:
- Dispatches per Pydantic field type: date → parse-both-sides compare;
  int/float/Decimal → normalize + whole-snippet / numeric-token scan;
  IBAN (detected via `iban` in the field name) → upper+strip compare;
  Literal / None → flags stay None; anything else → string substring match.
- _unwrap_optional handles BOTH typing.Union AND types.UnionType so
  `Decimal | None` (PEP 604, what get_type_hints emits on 3.12+) resolves
  correctly — caught by the integration-style test_writes_flags_and_counters.
- Number comparator scans numeric tokens in the snippet so labels
  ("Closing balance CHF 1'234.56") don't mask the match.
- apply_reliability_flags mutates the passed ProvenanceData in place and
  writes verified_fields / text_agreement_fields to quality_metrics.
Tests cover each comparator, Literal/None skip, short-value skip (strings
and numerics), Decimal via optional union, and end-to-end flag+counter
writing against a Pydantic use-case schema that mirrors bank_statement_header.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
206 lines
7.6 KiB
Python
206 lines
7.6 KiB
Python
"""Tests for the provenance mapper (spec §9.4)."""
|
|
|
|
from __future__ import annotations
|
|
|
|
from ix.contracts import (
|
|
BoundingBox,
|
|
Line,
|
|
OCRDetails,
|
|
OCRResult,
|
|
Page,
|
|
SegmentCitation,
|
|
)
|
|
from ix.provenance.mapper import (
|
|
map_segment_refs_to_provenance,
|
|
resolve_nested_path,
|
|
)
|
|
from ix.segmentation import PageMetadata, SegmentIndex
|
|
|
|
|
|
def _make_index_with_lines(lines: list[tuple[str, int]]) -> SegmentIndex:
    """Return a line-granularity ``SegmentIndex`` over one synthetic page.

    ``lines`` holds ``(text, file_index)`` pairs. Every text becomes an OCR
    line on page 1, all sharing the same fixed bounding box. Only the first
    pair's ``file_index`` is used — it is applied to the whole page — and an
    empty ``lines`` falls back to ``file_index`` 0.
    """
    # The single page carries one file_index, so take it from the first entry.
    page_file_index = lines[0][1] if lines else 0

    single_page = Page(
        page_no=1,
        width=100.0,
        height=200.0,
        lines=[
            Line(text=text, bounding_box=[0, 0, 10, 0, 10, 5, 0, 5])
            for text, _unused_file_index in lines
        ],
    )
    return SegmentIndex.build(
        ocr_result=OCRResult(result=OCRDetails(pages=[single_page])),
        granularity="line",
        pages_metadata=[PageMetadata(file_index=page_file_index)],
    )
|
|
|
|
|
|
class TestResolveNestedPath:
    """Dot-path lookups performed by ``resolve_nested_path``."""

    def test_simple_path(self) -> None:
        """A two-segment path reaches a value one level below the root key."""
        payload = {"result": {"a": "x"}}
        assert resolve_nested_path(payload, "result.a") == "x"

    def test_nested_path(self) -> None:
        """A three-segment path walks through an intermediate dict."""
        payload = {"result": {"header": {"bank": "UBS"}}}
        assert resolve_nested_path(payload, "result.header.bank") == "UBS"

    def test_missing_path_returns_none(self) -> None:
        """An absent key resolves to ``None``."""
        payload = {"result": {}}
        assert resolve_nested_path(payload, "result.nope") is None

    def test_array_bracket_notation_normalised(self) -> None:
        """``items[N]`` bracket indices address list elements."""
        payload = {"result": {"items": [{"name": "a"}, {"name": "b"}]}}
        expectations = (
            ("result.items[0].name", "a"),
            ("result.items[1].name", "b"),
        )
        for path, expected in expectations:
            assert resolve_nested_path(payload, path) == expected

    def test_array_dot_notation(self) -> None:
        """``items.N`` dot indices address list elements as well."""
        payload = {"result": {"items": [{"name": "a"}, {"name": "b"}]}}
        assert resolve_nested_path(payload, "result.items.0.name") == "a"
|
|
|
|
|
|
class TestMapper:
    """Checks for ``map_segment_refs_to_provenance`` (spec §9.4)."""

    def test_simple_single_field(self) -> None:
        """One valid citation yields a single, fully populated source."""
        segment_index = _make_index_with_lines([("UBS AG", 0), ("Header text", 0)])
        citation = SegmentCitation(
            field_path="result.bank_name",
            value_segment_ids=["p1_l0"],
        )

        provenance = map_segment_refs_to_provenance(
            extraction_result={"result": {"bank_name": "UBS AG"}},
            segment_citations=[citation],
            segment_index=segment_index,
            max_sources_per_field=10,
            min_confidence=0.0,
            include_bounding_boxes=True,
            source_type="value_and_context",
        )

        field_prov = provenance.fields["result.bank_name"]
        assert field_prov.field_name == "bank_name"
        assert field_prov.value == "UBS AG"
        assert len(field_prov.sources) == 1

        source = field_prov.sources[0]
        assert source.segment_id == "p1_l0"
        assert source.text_snippet == "UBS AG"
        assert source.page_number == 1
        assert source.file_index == 0
        assert isinstance(source.bounding_box, BoundingBox)

        # Quality metrics are written even when nothing is invalid.
        assert provenance.quality_metrics["invalid_references"] == 0

    def test_invalid_reference_counted(self) -> None:
        """An unknown segment id is counted; the valid one is still kept."""
        segment_index = _make_index_with_lines([("UBS AG", 0)])
        citation = SegmentCitation(
            field_path="result.bank_name",
            # "p9_l9" is not present in the one-line index built above.
            value_segment_ids=["p1_l0", "p9_l9"],
        )

        provenance = map_segment_refs_to_provenance(
            extraction_result={"result": {"bank_name": "UBS AG"}},
            segment_citations=[citation],
            segment_index=segment_index,
            max_sources_per_field=10,
            min_confidence=0.0,
            include_bounding_boxes=True,
            source_type="value_and_context",
        )

        assert provenance.quality_metrics["invalid_references"] == 1
        # The remaining valid reference still produces a source.
        assert len(provenance.fields["result.bank_name"].sources) == 1

    def test_max_sources_cap(self) -> None:
        """Five cited lines with a cap of two leave exactly two sources."""
        segment_index = _make_index_with_lines([(f"line {i}", 0) for i in range(5)])
        citation = SegmentCitation(
            field_path="result.notes",
            value_segment_ids=[f"p1_l{i}" for i in range(5)],
        )

        provenance = map_segment_refs_to_provenance(
            extraction_result={"result": {"notes": "noise"}},
            segment_citations=[citation],
            segment_index=segment_index,
            max_sources_per_field=2,
            min_confidence=0.0,
            include_bounding_boxes=True,
            source_type="value_and_context",
        )

        assert len(provenance.fields["result.notes"].sources) == 2

    def test_source_type_value_only(self) -> None:
        """``source_type="value"`` ignores the context segment ids."""
        segment_index = _make_index_with_lines([("label:", 0), ("UBS AG", 0)])
        citation = SegmentCitation(
            field_path="result.bank_name",
            value_segment_ids=["p1_l1"],
            context_segment_ids=["p1_l0"],
        )

        provenance = map_segment_refs_to_provenance(
            extraction_result={"result": {"bank_name": "UBS AG"}},
            segment_citations=[citation],
            segment_index=segment_index,
            max_sources_per_field=10,
            min_confidence=0.0,
            include_bounding_boxes=True,
            source_type="value",
        )

        # Only the value segment should appear among the sources.
        cited_ids = [s.segment_id for s in provenance.fields["result.bank_name"].sources]
        assert cited_ids == ["p1_l1"]

    def test_source_type_value_and_context(self) -> None:
        """``value_and_context`` lists value segments first, then context."""
        segment_index = _make_index_with_lines([("label:", 0), ("UBS AG", 0)])
        citation = SegmentCitation(
            field_path="result.bank_name",
            value_segment_ids=["p1_l1"],
            context_segment_ids=["p1_l0"],
        )

        provenance = map_segment_refs_to_provenance(
            extraction_result={"result": {"bank_name": "UBS AG"}},
            segment_citations=[citation],
            segment_index=segment_index,
            max_sources_per_field=10,
            min_confidence=0.0,
            include_bounding_boxes=True,
            source_type="value_and_context",
        )

        cited_ids = [s.segment_id for s in provenance.fields["result.bank_name"].sources]
        assert cited_ids == ["p1_l1", "p1_l0"]

    def test_include_bounding_boxes_false(self) -> None:
        """With bounding boxes disabled the source carries ``None``."""
        segment_index = _make_index_with_lines([("UBS AG", 0)])
        citation = SegmentCitation(
            field_path="result.bank_name",
            value_segment_ids=["p1_l0"],
        )

        provenance = map_segment_refs_to_provenance(
            extraction_result={"result": {"bank_name": "UBS AG"}},
            segment_citations=[citation],
            segment_index=segment_index,
            max_sources_per_field=10,
            min_confidence=0.0,
            include_bounding_boxes=False,
            source_type="value_and_context",
        )

        first_source = provenance.fields["result.bank_name"].sources[0]
        assert first_source.bounding_box is None

    def test_field_with_no_valid_sources_skipped(self) -> None:
        """A field whose only citation is invalid is left out entirely."""
        segment_index = _make_index_with_lines([("UBS", 0)])
        citation = SegmentCitation(
            field_path="result.ghost",
            value_segment_ids=["p9_l9"],
        )

        provenance = map_segment_refs_to_provenance(
            extraction_result={"result": {"ghost": "x"}},
            segment_citations=[citation],
            segment_index=segment_index,
            max_sources_per_field=10,
            min_confidence=0.0,
            include_bounding_boxes=True,
            source_type="value_and_context",
        )

        # Per spec §9.4, a field with zero valid sources is not added.
        assert "result.ghost" not in provenance.fields
        assert provenance.quality_metrics["invalid_references"] == 1
|