# infoxtractor/tests/unit/test_provenance_mapper.py

"""Tests for the provenance mapper (spec §9.4)."""

from __future__ import annotations

from ix.contracts import (
    BoundingBox,
    Line,
    OCRDetails,
    OCRResult,
    Page,
    SegmentCitation,
)
from ix.provenance.mapper import (
    map_segment_refs_to_provenance,
    resolve_nested_path,
)
from ix.segmentation import PageMetadata, SegmentIndex


def _make_index_with_lines(lines: list[tuple[str, int]]) -> SegmentIndex:
    """Return a single-page, line-granularity ``SegmentIndex`` for tests.

    ``lines`` is a list of ``(text, file_index)`` pairs. Each text becomes one
    OCR line on page 1, and every line gets the same placeholder bounding box.

    Only the first pair's ``file_index`` is used: it is attached to the page's
    metadata (there is just one page). An empty ``lines`` falls back to 0.
    """
    placeholder_box = (0, 0, 10, 0, 10, 5, 0, 5)
    # A fresh list per line so no two Line objects share one box instance.
    page_lines = [
        Line(text=text, bounding_box=list(placeholder_box)) for text, _ignored in lines
    ]
    single_page = Page(page_no=1, width=100.0, height=200.0, lines=page_lines)

    # The page-level file index comes from the first entry only.
    page_file_index = 0
    if lines:
        page_file_index = lines[0][1]

    return SegmentIndex.build(
        ocr_result=OCRResult(result=OCRDetails(pages=[single_page])),
        granularity="line",
        pages_metadata=[PageMetadata(file_index=page_file_index)],
    )


class TestResolveNestedPath:
    """Checks for ``resolve_nested_path`` with dotted and bracketed paths."""

    def test_simple_path(self) -> None:
        """A two-segment dotted path returns the leaf value."""
        payload = {"result": {"a": "x"}}
        found = resolve_nested_path(payload, "result.a")
        assert found == "x"

    def test_nested_path(self) -> None:
        """A three-segment dotted path walks through nested dicts."""
        payload = {"result": {"header": {"bank": "UBS"}}}
        found = resolve_nested_path(payload, "result.header.bank")
        assert found == "UBS"

    def test_missing_path_returns_none(self) -> None:
        """A key that is absent resolves to ``None``."""
        found = resolve_nested_path({"result": {}}, "result.nope")
        assert found is None

    def test_array_bracket_notation_normalised(self) -> None:
        """``items[N]`` bracket indices select the N-th list element."""
        payload = {"result": {"items": [{"name": "a"}, {"name": "b"}]}}
        first = resolve_nested_path(payload, "result.items[0].name")
        second = resolve_nested_path(payload, "result.items[1].name")
        assert first == "a"
        assert second == "b"

    def test_array_dot_notation(self) -> None:
        """``items.N`` dot indices select the N-th list element."""
        payload = {"result": {"items": [{"name": "a"}, {"name": "b"}]}}
        found = resolve_nested_path(payload, "result.items.0.name")
        assert found == "a"


class TestMapper:
    """Checks for ``map_segment_refs_to_provenance`` and its keyword options."""

    @staticmethod
    def _run_mapper(
        extraction: dict,
        citations: list[SegmentCitation],
        index: SegmentIndex,
        *,
        max_sources: int = 10,
        include_boxes: bool = True,
        source_type: str = "value_and_context",
    ):
        """Invoke the mapper with the settings the tests in this class share.

        Defaults are a cap of 10 sources per field, bounding boxes included
        and source type ``"value_and_context"``; ``min_confidence`` is always
        0.0. Tests override only the option they are exercising.
        """
        return map_segment_refs_to_provenance(
            extraction_result=extraction,
            segment_citations=citations,
            segment_index=index,
            max_sources_per_field=max_sources,
            min_confidence=0.0,
            include_bounding_boxes=include_boxes,
            source_type=source_type,
        )

    def test_simple_single_field(self) -> None:
        """One valid citation produces one fully populated source."""
        index = _make_index_with_lines([("UBS AG", 0), ("Header text", 0)])
        extraction = {"result": {"bank_name": "UBS AG"}}
        citation = SegmentCitation(
            field_path="result.bank_name",
            value_segment_ids=["p1_l0"],
        )

        provenance = self._run_mapper(extraction, [citation], index)

        field = provenance.fields["result.bank_name"]
        assert field.field_name == "bank_name"
        assert field.value == "UBS AG"
        assert len(field.sources) == 1

        source = field.sources[0]
        assert source.segment_id == "p1_l0"
        assert source.text_snippet == "UBS AG"
        assert source.page_number == 1
        assert source.file_index == 0
        assert isinstance(source.bounding_box, BoundingBox)

        # Quality metrics are filled in; nothing was invalid here.
        assert provenance.quality_metrics["invalid_references"] == 0

    def test_invalid_reference_counted(self) -> None:
        """An unknown segment id is counted while valid ones are kept."""
        index = _make_index_with_lines([("UBS AG", 0)])
        extraction = {"result": {"bank_name": "UBS AG"}}
        # "p9_l9" is not in the index; "p1_l0" is.
        citation = SegmentCitation(
            field_path="result.bank_name",
            value_segment_ids=["p1_l0", "p9_l9"],
        )

        provenance = self._run_mapper(extraction, [citation], index)

        assert provenance.quality_metrics["invalid_references"] == 1
        # The single valid reference still becomes a source.
        assert len(provenance.fields["result.bank_name"].sources) == 1

    def test_max_sources_cap(self) -> None:
        """With five cited lines and a cap of 2, only two sources remain."""
        line_count = 5
        index = _make_index_with_lines([(f"line {i}", 0) for i in range(line_count)])
        citation = SegmentCitation(
            field_path="result.notes",
            value_segment_ids=[f"p1_l{i}" for i in range(line_count)],
        )

        provenance = self._run_mapper(
            {"result": {"notes": "noise"}},
            [citation],
            index,
            max_sources=2,
        )

        assert len(provenance.fields["result.notes"].sources) == 2

    def test_source_type_value_only(self) -> None:
        """``source_type="value"`` leaves out context segments."""
        index = _make_index_with_lines([("label:", 0), ("UBS AG", 0)])
        citation = SegmentCitation(
            field_path="result.bank_name",
            value_segment_ids=["p1_l1"],
            context_segment_ids=["p1_l0"],
        )

        provenance = self._run_mapper(
            {"result": {"bank_name": "UBS AG"}},
            [citation],
            index,
            source_type="value",
        )

        # Only the ids from value_segment_ids appear.
        segment_ids = [src.segment_id for src in provenance.fields["result.bank_name"].sources]
        assert segment_ids == ["p1_l1"]

    def test_source_type_value_and_context(self) -> None:
        """``"value_and_context"`` lists value segments, then context ones."""
        index = _make_index_with_lines([("label:", 0), ("UBS AG", 0)])
        citation = SegmentCitation(
            field_path="result.bank_name",
            value_segment_ids=["p1_l1"],
            context_segment_ids=["p1_l0"],
        )

        provenance = self._run_mapper(
            {"result": {"bank_name": "UBS AG"}},
            [citation],
            index,
            source_type="value_and_context",
        )

        segment_ids = [src.segment_id for src in provenance.fields["result.bank_name"].sources]
        assert segment_ids == ["p1_l1", "p1_l0"]

    def test_include_bounding_boxes_false(self) -> None:
        """Turning bounding boxes off leaves ``bounding_box`` as ``None``."""
        index = _make_index_with_lines([("UBS AG", 0)])
        citation = SegmentCitation(
            field_path="result.bank_name",
            value_segment_ids=["p1_l0"],
        )

        provenance = self._run_mapper(
            {"result": {"bank_name": "UBS AG"}},
            [citation],
            index,
            include_boxes=False,
        )

        only_source = provenance.fields["result.bank_name"].sources[0]
        assert only_source.bounding_box is None

    def test_field_with_no_valid_sources_skipped(self) -> None:
        """A field whose only citation is invalid is left out of the result."""
        index = _make_index_with_lines([("UBS", 0)])
        citation = SegmentCitation(
            field_path="result.ghost",
            value_segment_ids=["p9_l9"],
        )

        provenance = self._run_mapper({"result": {"ghost": "x"}}, [citation], index)

        # Field not added when zero valid sources (spec §9.4 step).
        assert "result.ghost" not in provenance.fields
        assert provenance.quality_metrics["invalid_references"] == 1