"""Tests for the provenance mapper (spec §9.4).""" from __future__ import annotations from ix.contracts import ( BoundingBox, Line, OCRDetails, OCRResult, Page, SegmentCitation, ) from ix.provenance.mapper import ( map_segment_refs_to_provenance, resolve_nested_path, ) from ix.segmentation import PageMetadata, SegmentIndex def _make_index_with_lines(lines: list[tuple[str, int]]) -> SegmentIndex: """Build a tiny index where each line has a known text + file_index. Each entry is (text, file_index); all entries go on a single page. """ ocr_lines = [Line(text=t, bounding_box=[0, 0, 10, 0, 10, 5, 0, 5]) for t, _ in lines] page = Page(page_no=1, width=100.0, height=200.0, lines=ocr_lines) ocr = OCRResult(result=OCRDetails(pages=[page])) # file_index for the whole page — the test uses a single page. file_index = lines[0][1] if lines else 0 return SegmentIndex.build( ocr_result=ocr, granularity="line", pages_metadata=[PageMetadata(file_index=file_index)], ) class TestResolveNestedPath: def test_simple_path(self) -> None: assert resolve_nested_path({"result": {"a": "x"}}, "result.a") == "x" def test_nested_path(self) -> None: data = {"result": {"header": {"bank": "UBS"}}} assert resolve_nested_path(data, "result.header.bank") == "UBS" def test_missing_path_returns_none(self) -> None: assert resolve_nested_path({"result": {}}, "result.nope") is None def test_array_bracket_notation_normalised(self) -> None: data = {"result": {"items": [{"name": "a"}, {"name": "b"}]}} assert resolve_nested_path(data, "result.items[0].name") == "a" assert resolve_nested_path(data, "result.items[1].name") == "b" def test_array_dot_notation(self) -> None: data = {"result": {"items": [{"name": "a"}, {"name": "b"}]}} assert resolve_nested_path(data, "result.items.0.name") == "a" class TestMapper: def test_simple_single_field(self) -> None: idx = _make_index_with_lines([("UBS AG", 0), ("Header text", 0)]) extraction = {"result": {"bank_name": "UBS AG"}} citations = [ SegmentCitation(field_path="result.bank_name", value_segment_ids=["p1_l0"]) ] prov = map_segment_refs_to_provenance( extraction_result=extraction, segment_citations=citations, segment_index=idx, max_sources_per_field=10, min_confidence=0.0, include_bounding_boxes=True, source_type="value_and_context", ) fp = prov.fields["result.bank_name"] assert fp.field_name == "bank_name" assert fp.value == "UBS AG" assert len(fp.sources) == 1 src = fp.sources[0] assert src.segment_id == "p1_l0" assert src.text_snippet == "UBS AG" assert src.page_number == 1 assert src.file_index == 0 assert isinstance(src.bounding_box, BoundingBox) # quality_metrics populated assert prov.quality_metrics["invalid_references"] == 0 def test_invalid_reference_counted(self) -> None: idx = _make_index_with_lines([("UBS AG", 0)]) extraction = {"result": {"bank_name": "UBS AG"}} citations = [ SegmentCitation( field_path="result.bank_name", value_segment_ids=["p1_l0", "p9_l9"], # p9_l9 doesn't exist ) ] prov = map_segment_refs_to_provenance( extraction_result=extraction, segment_citations=citations, segment_index=idx, max_sources_per_field=10, min_confidence=0.0, include_bounding_boxes=True, source_type="value_and_context", ) assert prov.quality_metrics["invalid_references"] == 1 # The one valid source still populated. assert len(prov.fields["result.bank_name"].sources) == 1 def test_max_sources_cap(self) -> None: # Five lines; ask for a cap of 2. idx = _make_index_with_lines([(f"line {i}", 0) for i in range(5)]) citations = [ SegmentCitation( field_path="result.notes", value_segment_ids=[f"p1_l{i}" for i in range(5)], ) ] prov = map_segment_refs_to_provenance( extraction_result={"result": {"notes": "noise"}}, segment_citations=citations, segment_index=idx, max_sources_per_field=2, min_confidence=0.0, include_bounding_boxes=True, source_type="value_and_context", ) assert len(prov.fields["result.notes"].sources) == 2 def test_source_type_value_only(self) -> None: idx = _make_index_with_lines([("label:", 0), ("UBS AG", 0)]) citations = [ SegmentCitation( field_path="result.bank_name", value_segment_ids=["p1_l1"], context_segment_ids=["p1_l0"], ) ] prov = map_segment_refs_to_provenance( extraction_result={"result": {"bank_name": "UBS AG"}}, segment_citations=citations, segment_index=idx, max_sources_per_field=10, min_confidence=0.0, include_bounding_boxes=True, source_type="value", ) sources = prov.fields["result.bank_name"].sources # Only value_segment_ids included. assert [s.segment_id for s in sources] == ["p1_l1"] def test_source_type_value_and_context(self) -> None: idx = _make_index_with_lines([("label:", 0), ("UBS AG", 0)]) citations = [ SegmentCitation( field_path="result.bank_name", value_segment_ids=["p1_l1"], context_segment_ids=["p1_l0"], ) ] prov = map_segment_refs_to_provenance( extraction_result={"result": {"bank_name": "UBS AG"}}, segment_citations=citations, segment_index=idx, max_sources_per_field=10, min_confidence=0.0, include_bounding_boxes=True, source_type="value_and_context", ) sources = prov.fields["result.bank_name"].sources assert [s.segment_id for s in sources] == ["p1_l1", "p1_l0"] def test_include_bounding_boxes_false(self) -> None: idx = _make_index_with_lines([("UBS AG", 0)]) citations = [ SegmentCitation(field_path="result.bank_name", value_segment_ids=["p1_l0"]) ] prov = map_segment_refs_to_provenance( extraction_result={"result": {"bank_name": "UBS AG"}}, segment_citations=citations, segment_index=idx, max_sources_per_field=10, min_confidence=0.0, include_bounding_boxes=False, source_type="value_and_context", ) assert prov.fields["result.bank_name"].sources[0].bounding_box is None def test_field_with_no_valid_sources_skipped(self) -> None: idx = _make_index_with_lines([("UBS", 0)]) citations = [ SegmentCitation(field_path="result.ghost", value_segment_ids=["p9_l9"]) ] prov = map_segment_refs_to_provenance( extraction_result={"result": {"ghost": "x"}}, segment_citations=citations, segment_index=idx, max_sources_per_field=10, min_confidence=0.0, include_bounding_boxes=True, source_type="value_and_context", ) # Field not added when zero valid sources (spec §9.4 step). assert "result.ghost" not in prov.fields assert prov.quality_metrics["invalid_references"] == 1