Lands the two remaining provenance-subsystem pieces:
mapper.py — map_segment_refs_to_provenance:
- For each LLM SegmentCitation, pick seg-ids per source_type
(`value` vs `value_and_context`), cap at max_sources_per_field,
resolve each via SegmentIndex, track invalid references.
- Resolve field values by dot-path (`result.items[0].name` supported —
`[N]` bracket notation is normalised to `.N` before traversal).
- Skip fields that resolve to zero valid sources (spec §9.4).
- Write quality_metrics with fields_with_provenance / total_fields /
coverage_rate / invalid_references.
verify.py — verify_field + apply_reliability_flags:
- Dispatches per Pydantic field type: date → parse-both-sides compare;
int/float/Decimal → normalize + whole-snippet / numeric-token scan;
IBAN (detected via `iban` in field name) → upper+strip compare;
Literal / None → flags stay None; else string substring.
- _unwrap_optional handles BOTH typing.Union AND types.UnionType so
`Decimal | None` (PEP 604, what get_type_hints emits on 3.12+) resolves
correctly — caught by the integration-style test_writes_flags_and_counters.
- Number comparator scans numeric tokens in the snippet so labels
("Closing balance CHF 1'234.56") don't mask the match.
- apply_reliability_flags mutates the passed ProvenanceData in place and
writes verified_fields / text_agreement_fields to quality_metrics.
Tests cover each comparator, Literal/None skip, short-value skip (strings
and numerics), Decimal via optional union, and end-to-end flag+counter
writing against a Pydantic use-case schema that mirrors bank_statement_header.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
145 lines
4.8 KiB
Python
"""Maps LLM-emitted :class:`SegmentCitation` lists to :class:`ProvenanceData`.

Implements spec §9.4. The algorithm is deliberately small:

1. For each citation, pick the seg-id list (``value`` vs. ``value_and_context``).
2. Cap at ``max_sources_per_field``.
3. Resolve each ID via :meth:`SegmentIndex.lookup_segment`; count misses.
4. Resolve the field's value by dot-path traversal of the extraction result.
5. Build a :class:`FieldProvenance`. Skip fields that resolved to zero sources.

No verification / normalisation happens here — this module's sole job is
structural assembly. :mod:`ix.provenance.verify` does the reliability pass
downstream.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
from typing import Any, Literal
|
|
|
|
from ix.contracts.provenance import (
|
|
ExtractionSource,
|
|
FieldProvenance,
|
|
ProvenanceData,
|
|
SegmentCitation,
|
|
)
|
|
from ix.segmentation import SegmentIndex
|
|
|
|
# Citation source-selection mode: "value" uses only value segments;
# "value_and_context" additionally appends context segments (spec §9.4).
SourceType = Literal["value", "value_and_context"]


# Rewrites "[N]" bracket indexing to ".N" so a single dot-split handles
# both "items[0].name" and "items.0.name" path spellings.
_BRACKET_RE = re.compile(r"\[(\d+)\]")


def resolve_nested_path(data: Any, path: str) -> Any:
    """Resolve a dot-path into ``data`` with ``[N]`` array notation normalised.

    ``"result.items[0].name"`` → walks ``data["result"]["items"][0]["name"]``.
    Sequence steps accept both lists and tuples. Returns ``None`` at any
    missing-key / index-out-of-range step so callers can fall back to
    recording the field with a null value.
    """
    normalised = _BRACKET_RE.sub(r".\1", path)
    cur: Any = data
    for part in normalised.split("."):
        if cur is None:
            return None
        if part.isdigit() and isinstance(cur, (list, tuple)):
            # isdigit() guarantees a non-negative integer, so only the
            # upper bound needs checking.
            i = int(part)
            if i >= len(cur):
                return None
            cur = cur[i]
        elif isinstance(cur, dict):
            # Digit-looking keys on dicts are still treated as string keys.
            cur = cur.get(part)
        else:
            # Scalar reached before the path was exhausted.
            return None
    return cur
|
|
|
|
|
|
def _segment_ids_for_citation(
|
|
citation: SegmentCitation,
|
|
source_type: SourceType,
|
|
) -> list[str]:
|
|
if source_type == "value":
|
|
return list(citation.value_segment_ids)
|
|
# value_and_context
|
|
return list(citation.value_segment_ids) + list(citation.context_segment_ids)
|
|
|
|
|
|
def map_segment_refs_to_provenance(
    extraction_result: dict[str, Any],
    segment_citations: list[SegmentCitation],
    segment_index: SegmentIndex,
    max_sources_per_field: int,
    min_confidence: float,  # reserved (no-op for MVP)
    include_bounding_boxes: bool,
    source_type: SourceType,
) -> ProvenanceData:
    """Build a :class:`ProvenanceData` from LLM citations and a SegmentIndex.

    For each citation: select seg-ids per *source_type*, cap at
    *max_sources_per_field*, resolve each via the index (counting misses),
    resolve the field value by dot-path, and record a
    :class:`FieldProvenance`. Citations whose seg-ids all fail to resolve
    are dropped entirely (spec §9.4).
    """
    # min_confidence is reserved for future use (see spec §2 provenance options).
    _ = min_confidence

    field_map: dict[str, FieldProvenance] = {}
    missing_refs = 0

    for citation in segment_citations:
        candidate_ids = _segment_ids_for_citation(citation, source_type)
        resolved: list[ExtractionSource] = []
        for seg_id in candidate_ids[:max_sources_per_field]:
            pos = segment_index.lookup_segment(seg_id)
            if pos is None:
                # LLM hallucinated a seg-id: count it, keep going.
                missing_refs += 1
                continue
            bbox = pos["bbox"] if include_bounding_boxes else None
            resolved.append(
                ExtractionSource(
                    page_number=pos["page"],
                    file_index=pos.get("file_index"),
                    bounding_box=bbox,
                    text_snippet=pos["text"],
                    relevance_score=1.0,
                    segment_id=seg_id,
                )
            )
        # Zero valid sources → skip the field entirely (spec §9.4).
        if not resolved:
            continue

        path = citation.field_path
        field_map[path] = FieldProvenance(
            field_name=path.rsplit(".", 1)[-1],
            field_path=path,
            value=resolve_nested_path(extraction_result, path),
            sources=resolved,
            confidence=None,
        )

    leaf_total = _count_leaf_fields(extraction_result.get("result", {}))
    coverage = len(field_map) / leaf_total if leaf_total > 0 else None

    return ProvenanceData(
        fields=field_map,
        quality_metrics={
            "fields_with_provenance": len(field_map),
            # An empty result reports null rather than 0.
            "total_fields": leaf_total or None,
            "coverage_rate": coverage,
            "invalid_references": missing_refs,
        },
        # NOTE(review): reaches into SegmentIndex internals for the segment
        # count — a public `len()`/property on SegmentIndex would be cleaner.
        segment_count=len(segment_index._ordered_ids),
        granularity=segment_index.granularity,
    )
|
|
|
|
|
|
def _count_leaf_fields(data: Any) -> int:
|
|
"""Count non-container leaves (str/int/float/Decimal/date/bool/None) recursively."""
|
|
if data is None:
|
|
return 1
|
|
if isinstance(data, dict):
|
|
if not data:
|
|
return 0
|
|
return sum(_count_leaf_fields(v) for v in data.values())
|
|
if isinstance(data, list):
|
|
if not data:
|
|
return 0
|
|
return sum(_count_leaf_fields(v) for v in data)
|
|
return 1
|