infoxtractor/tests/unit/test_provenance_mapper.py
Dirk Riemann 1e340c82fa
All checks were successful
tests / test (pull_request) Successful in 1m10s
tests / test (push) Successful in 1m11s
feat(provenance): mapper + verifier for ReliabilityStep (spec §9.4, §6)
Lands the two remaining provenance-subsystem pieces:

mapper.py — map_segment_refs_to_provenance:
- For each LLM SegmentCitation, pick seg-ids per source_type
  (`value` vs `value_and_context`), cap at max_sources_per_field,
  resolve each via SegmentIndex, track invalid references.
- Resolve field values by dot-path (`result.items[0].name` supported —
  `[N]` bracket notation is normalised to `.N` before traversal).
- Skip fields that resolve to zero valid sources (spec §9.4).
- Write quality_metrics with fields_with_provenance / total_fields /
  coverage_rate / invalid_references.

verify.py — verify_field + apply_reliability_flags:
- Dispatches per Pydantic field type: date → parse-both-sides compare;
  int/float/Decimal → normalize + whole-snippet / numeric-token scan;
  IBAN (detected via `iban` in field name) → upper+strip compare;
  Literal / None → flags stay None; else string substring.
- _unwrap_optional handles BOTH typing.Union AND types.UnionType so
  `Decimal | None` (PEP 604, what get_type_hints emits on 3.12+) resolves
  correctly — caught by the integration-style test_writes_flags_and_counters.
- Number comparator scans numeric tokens in the snippet so labels
  ("Closing balance CHF 1'234.56") don't mask the match.
- apply_reliability_flags mutates the passed ProvenanceData in place and
  writes verified_fields / text_agreement_fields to quality_metrics.

Tests cover each comparator, Literal/None skip, short-value skip (strings
and numerics), Decimal via optional union, and end-to-end flag+counter
writing against a Pydantic use-case schema that mirrors bank_statement_header.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-18 11:01:19 +02:00

206 lines
7.6 KiB
Python

"""Tests for the provenance mapper (spec §9.4)."""
from __future__ import annotations
from ix.contracts import (
BoundingBox,
Line,
OCRDetails,
OCRResult,
Page,
SegmentCitation,
)
from ix.provenance.mapper import (
map_segment_refs_to_provenance,
resolve_nested_path,
)
from ix.segmentation import PageMetadata, SegmentIndex
def _make_index_with_lines(lines: list[tuple[str, int]]) -> SegmentIndex:
    """Create a line-granularity ``SegmentIndex`` over one synthetic page.

    ``lines`` is a list of ``(text, file_index)`` pairs. Every text becomes
    one OCR ``Line`` on page 1, all with the same fixed bounding box. Only
    the ``file_index`` of the first pair is used (it is applied to the whole
    page); an empty ``lines`` list falls back to ``file_index`` 0.
    """
    page_lines = []
    for text, _unused_file_index in lines:
        # Same fixed 8-number box for every line (presumably four x/y corner
        # points — confirm against the Line contract).
        page_lines.append(Line(text=text, bounding_box=[0, 0, 10, 0, 10, 5, 0, 5]))

    single_page = Page(page_no=1, width=100.0, height=200.0, lines=page_lines)
    ocr_result = OCRResult(result=OCRDetails(pages=[single_page]))

    # There is only one page, so a single file_index covers everything.
    if lines:
        page_file_index = lines[0][1]
    else:
        page_file_index = 0

    metadata = [PageMetadata(file_index=page_file_index)]
    return SegmentIndex.build(
        ocr_result=ocr_result,
        granularity="line",
        pages_metadata=metadata,
    )
class TestResolveNestedPath:
    """Tests for ``resolve_nested_path`` lookups on nested dicts and lists."""

    def test_simple_path(self) -> None:
        """A two-segment dot path returns the leaf value."""
        payload = {"result": {"a": "x"}}
        resolved = resolve_nested_path(payload, "result.a")
        assert resolved == "x"

    def test_nested_path(self) -> None:
        """A deeper dot path walks through intermediate dicts."""
        payload = {"result": {"header": {"bank": "UBS"}}}
        resolved = resolve_nested_path(payload, "result.header.bank")
        assert resolved == "UBS"

    def test_missing_path_returns_none(self) -> None:
        """A key that is absent resolves to ``None``."""
        resolved = resolve_nested_path({"result": {}}, "result.nope")
        assert resolved is None

    def test_array_bracket_notation_normalised(self) -> None:
        """``items[N]`` bracket indexing reaches the N-th list element."""
        payload = {"result": {"items": [{"name": "a"}, {"name": "b"}]}}
        expectations = (
            ("result.items[0].name", "a"),
            ("result.items[1].name", "b"),
        )
        for path, expected in expectations:
            assert resolve_nested_path(payload, path) == expected

    def test_array_dot_notation(self) -> None:
        """``items.N`` dot indexing reaches the N-th list element."""
        payload = {"result": {"items": [{"name": "a"}, {"name": "b"}]}}
        resolved = resolve_nested_path(payload, "result.items.0.name")
        assert resolved == "a"
class TestMapper:
    """Tests for ``map_segment_refs_to_provenance``.

    Each test builds a small single-page index with
    ``_make_index_with_lines``, feeds the mapper one ``SegmentCitation`` and
    checks the resulting ``fields`` / ``quality_metrics``.
    """

    def test_simple_single_field(self) -> None:
        """One valid citation produces one fully populated source."""
        index = _make_index_with_lines([("UBS AG", 0), ("Header text", 0)])
        citation = SegmentCitation(
            field_path="result.bank_name", value_segment_ids=["p1_l0"]
        )

        provenance = map_segment_refs_to_provenance(
            extraction_result={"result": {"bank_name": "UBS AG"}},
            segment_citations=[citation],
            segment_index=index,
            source_type="value_and_context",
            include_bounding_boxes=True,
            min_confidence=0.0,
            max_sources_per_field=10,
        )

        field = provenance.fields["result.bank_name"]
        assert field.field_name == "bank_name"
        assert field.value == "UBS AG"
        assert len(field.sources) == 1

        source = field.sources[0]
        assert source.segment_id == "p1_l0"
        assert source.text_snippet == "UBS AG"
        assert source.page_number == 1
        assert source.file_index == 0
        assert isinstance(source.bounding_box, BoundingBox)

        # No bad segment ids were cited, so the counter stays at zero.
        assert provenance.quality_metrics["invalid_references"] == 0

    def test_invalid_reference_counted(self) -> None:
        """An unknown segment id is counted; the valid one is still kept."""
        index = _make_index_with_lines([("UBS AG", 0)])
        citation = SegmentCitation(
            field_path="result.bank_name",
            # "p9_l9" is not present in the index built above.
            value_segment_ids=["p1_l0", "p9_l9"],
        )

        provenance = map_segment_refs_to_provenance(
            extraction_result={"result": {"bank_name": "UBS AG"}},
            segment_citations=[citation],
            segment_index=index,
            source_type="value_and_context",
            include_bounding_boxes=True,
            min_confidence=0.0,
            max_sources_per_field=10,
        )

        assert provenance.quality_metrics["invalid_references"] == 1
        # The resolvable citation must survive alongside the bad one.
        kept_sources = provenance.fields["result.bank_name"].sources
        assert len(kept_sources) == 1

    def test_max_sources_cap(self) -> None:
        """Five cited segments are cut down to ``max_sources_per_field``."""
        line_count = 5
        index = _make_index_with_lines(
            [(f"line {n}", 0) for n in range(line_count)]
        )
        citation = SegmentCitation(
            field_path="result.notes",
            value_segment_ids=[f"p1_l{n}" for n in range(line_count)],
        )

        provenance = map_segment_refs_to_provenance(
            extraction_result={"result": {"notes": "noise"}},
            segment_citations=[citation],
            segment_index=index,
            source_type="value_and_context",
            include_bounding_boxes=True,
            min_confidence=0.0,
            max_sources_per_field=2,
        )

        capped_sources = provenance.fields["result.notes"].sources
        assert len(capped_sources) == 2

    def test_source_type_value_only(self) -> None:
        """With ``source_type="value"`` context segment ids are left out."""
        index = _make_index_with_lines([("label:", 0), ("UBS AG", 0)])
        citation = SegmentCitation(
            field_path="result.bank_name",
            value_segment_ids=["p1_l1"],
            context_segment_ids=["p1_l0"],
        )

        provenance = map_segment_refs_to_provenance(
            extraction_result={"result": {"bank_name": "UBS AG"}},
            segment_citations=[citation],
            segment_index=index,
            source_type="value",
            include_bounding_boxes=True,
            min_confidence=0.0,
            max_sources_per_field=10,
        )

        # Only the value segment may appear among the sources.
        cited_ids = [
            source.segment_id
            for source in provenance.fields["result.bank_name"].sources
        ]
        assert cited_ids == ["p1_l1"]

    def test_source_type_value_and_context(self) -> None:
        """``value_and_context`` yields the value id followed by the context id."""
        index = _make_index_with_lines([("label:", 0), ("UBS AG", 0)])
        citation = SegmentCitation(
            field_path="result.bank_name",
            value_segment_ids=["p1_l1"],
            context_segment_ids=["p1_l0"],
        )

        provenance = map_segment_refs_to_provenance(
            extraction_result={"result": {"bank_name": "UBS AG"}},
            segment_citations=[citation],
            segment_index=index,
            source_type="value_and_context",
            include_bounding_boxes=True,
            min_confidence=0.0,
            max_sources_per_field=10,
        )

        cited_ids = [
            source.segment_id
            for source in provenance.fields["result.bank_name"].sources
        ]
        assert cited_ids == ["p1_l1", "p1_l0"]

    def test_include_bounding_boxes_false(self) -> None:
        """Disabling bounding boxes leaves ``bounding_box`` as ``None``."""
        index = _make_index_with_lines([("UBS AG", 0)])
        citation = SegmentCitation(
            field_path="result.bank_name", value_segment_ids=["p1_l0"]
        )

        provenance = map_segment_refs_to_provenance(
            extraction_result={"result": {"bank_name": "UBS AG"}},
            segment_citations=[citation],
            segment_index=index,
            source_type="value_and_context",
            include_bounding_boxes=False,
            min_confidence=0.0,
            max_sources_per_field=10,
        )

        first_source = provenance.fields["result.bank_name"].sources[0]
        assert first_source.bounding_box is None

    def test_field_with_no_valid_sources_skipped(self) -> None:
        """A field whose only citation is invalid is omitted from ``fields``."""
        index = _make_index_with_lines([("UBS", 0)])
        citation = SegmentCitation(
            field_path="result.ghost", value_segment_ids=["p9_l9"]
        )

        provenance = map_segment_refs_to_provenance(
            extraction_result={"result": {"ghost": "x"}},
            segment_citations=[citation],
            segment_index=index,
            source_type="value_and_context",
            include_bounding_boxes=True,
            min_confidence=0.0,
            max_sources_per_field=10,
        )

        # Zero valid sources means the field is dropped (spec §9.4), but the
        # bad reference is still counted.
        assert "result.ghost" not in provenance.fields
        assert provenance.quality_metrics["invalid_references"] == 1