feat(provenance): normalisers + short-value skip rule (spec §6) (#7)

Normalizer primitives land.
feat(provenance): normalisers + short-value skip rule (spec §6)
2026-04-18 08:56:45 +00:00 · 2026-04-18 10:56:31 +02:00
3 changed files with 337 additions and 0 deletions
--- a/src/ix/provenance/__init__.py
+++ b/src/ix/provenance/__init__.py
@ -0,0 +1,32 @@
+"""Provenance subsystem — normalisers, mapper, verifier.
+
+Three pieces compose the reliability check:
+
+* :mod:`ix.provenance.normalize` — pure text/number/date/IBAN normalisers
+  used to compare OCR snippets to extracted values.
+* :mod:`ix.provenance.mapper` — resolves LLM-emitted segment IDs to
+  :class:`~ix.contracts.provenance.FieldProvenance` entries.
+* :mod:`ix.provenance.verify` — per-field-type dispatcher that writes the
+  ``provenance_verified`` / ``text_agreement`` flags.
+
+Only :mod:`normalize` is exported from the package at this step; the mapper
+and verifier land in task 1.8.
+"""
+
+from __future__ import annotations
+
+from ix.provenance.normalize import (
+    normalize_date,
+    normalize_iban,
+    normalize_number,
+    normalize_string,
+    should_skip_text_agreement,
+)
+
+# Names re-exported at package level; all of them are imported from
+# ``ix.provenance.normalize`` above.
+__all__ = [
+    "normalize_date",
+    "normalize_iban",
+    "normalize_number",
+    "normalize_string",
+    "should_skip_text_agreement",
+]
--- a/src/ix/provenance/normalize.py
+++ b/src/ix/provenance/normalize.py
@ -0,0 +1,181 @@
+"""Pure normalisers used by the reliability check (spec §6).
+
+The ReliabilityStep compares extracted values against OCR segment snippets
+(and raw ``context.texts``) after passing both sides through the same
+normaliser. Keeping these functions pure (no IO, no state) means the
+ReliabilityStep itself can stay a thin dispatcher and every rule is
+directly unit-testable.
+
+All normalisers return ``str`` so the downstream ``substring`` / ``equals``
+comparison is trivial.
+"""
+
+from __future__ import annotations
+
+import re
+import unicodedata
+from datetime import date, datetime
+from decimal import Decimal, InvalidOperation
+from typing import Any, get_origin
+
+from dateutil import parser as _dateparser
+
+# ---------------------------------------------------------------------------
+# String
+# ---------------------------------------------------------------------------
+
+# Punctuation replaced by a space before comparing extracted values with OCR
+# text: dot, comma, colon, semicolon, exclamation and question mark, round /
+# square / curly brackets, forward and back slash, single and double quote,
+# backtick.
+_PUNCTUATION_RE = re.compile(r"[.,:;!?()\[\]{}/\\'\"`]")
+# Any run of whitespace; collapsed to a single space by ``normalize_string``.
+_WHITESPACE_RE = re.compile(r"\s+")
+
+
+def normalize_string(s: str) -> str:
+    """Return a comparison-friendly form of ``s``.
+
+    Steps, in order: Unicode NFKC normalisation, ``str.casefold()``, every
+    character matched by ``_PUNCTUATION_RE`` replaced by a space, whitespace
+    runs collapsed to one space, leading/trailing whitespace removed.
+    """
+    folded = unicodedata.normalize("NFKC", s).casefold()
+    without_punctuation = _PUNCTUATION_RE.sub(" ", folded)
+    collapsed = _WHITESPACE_RE.sub(" ", without_punctuation)
+    return collapsed.strip()
+
+
+# ---------------------------------------------------------------------------
+# Number
+# ---------------------------------------------------------------------------
+
+# Drop currency symbols / codes: remove every character that is not a digit,
+# sign, apostrophe, dot, comma, or whitespace. Whether apostrophe / dot /
+# comma act as thousands or decimal separator is decided structurally in
+# ``_parse_numeric_string``.
+_NUMERIC_KEEP_RE = re.compile(r"[^0-9.,'\s\-+]")
+
+
+def _parse_numeric_string(raw: str) -> Decimal:
+    """Heuristically decode a localised number string into a ``Decimal``.
+
+    Rules:
+
+    * Strip anything that isn't a digit, sign, dot, comma, apostrophe, or
+      whitespace (this drops currency symbols / codes).
+    * Apostrophes are always thousands separators (Swiss-German style).
+    * Whitespace is always a thousands separator (fr-FR style). This covers
+      every Unicode whitespace character - tabs, NO-BREAK SPACE (U+00A0) and
+      NARROW NO-BREAK SPACE (U+202F) included - not only the ASCII space.
+    * If both ``.`` and ``,`` appear, the rightmost is the decimal separator
+      and the other is the thousands separator.
+    * If only ``,`` appears: it is the decimal separator when exactly two
+      digits follow the last comma, otherwise a thousands separator.
+    * If only ``.`` appears: several dots whose last group is not exactly
+      two digits are thousands separators (``"1.234.567"``); in every other
+      case the string is handed to ``Decimal`` unchanged (``"1.5"`` is 1.5).
+
+    Raises:
+        InvalidOperation: if nothing numeric is left after cleaning, or if
+            ``Decimal`` rejects the cleaned string.
+    """
+    cleaned = _NUMERIC_KEEP_RE.sub("", raw).replace("'", "")
+    # ``str.split()`` splits on every Unicode whitespace character, so the
+    # join removes exactly what ``\s`` let through above. Removing only the
+    # ASCII space would leave tabs / no-break spaces inside the string and
+    # make ``Decimal()`` reject inputs such as "1\u00a0234,56".
+    cleaned = "".join(cleaned.split())
+
+    has_dot = "." in cleaned
+    has_comma = "," in cleaned
+
+    if has_dot and has_comma:
+        if cleaned.rfind(".") > cleaned.rfind(","):
+            # Dot is the decimal separator; commas group thousands.
+            cleaned = cleaned.replace(",", "")
+        else:
+            # Comma is the decimal separator; dots group thousands.
+            cleaned = cleaned.replace(".", "").replace(",", ".")
+    elif has_comma:
+        # Only comma - treat as decimal if 2 digits follow, else thousands.
+        tail = cleaned.split(",")[-1]
+        if len(tail) == 2 and tail.isdigit():
+            cleaned = cleaned.replace(",", ".")
+        else:
+            cleaned = cleaned.replace(",", "")
+    elif has_dot:
+        # Only dot. Multiple dots with a non-2-digit last group must be
+        # thousands separators (e.g. "1.234.567"); strip them. A single dot
+        # stays as-is (1.5 is 1.5).
+        tail = cleaned.split(".")[-1]
+        if (len(tail) != 2 or not tail.isdigit()) and cleaned.count(".") > 1:
+            cleaned = cleaned.replace(".", "")
+    if cleaned in ("", "+", "-"):
+        raise InvalidOperation(f"cannot parse number: {raw!r}")
+    return Decimal(cleaned)
+
+
+def normalize_number(value: int | float | Decimal | str) -> str:
+    """Render ``value`` as a plain decimal string with exactly two decimals.
+
+    Native ``int`` / ``float`` values are converted through their ``str()``
+    form, ``Decimal`` values are used directly, and anything else is handed
+    to ``_parse_numeric_string`` (so localised strings such as
+    ``"CHF 1'234.56"``, ``"1.234,56 EUR"`` or ``"-123.45"`` are accepted).
+    The result has the shape ``"[-]DDD.DD"``.
+    """
+    if isinstance(value, (int, float)):
+        # Going through str() avoids the binary expansion of a float.
+        number = Decimal(str(value))
+    elif isinstance(value, Decimal):
+        number = value
+    else:
+        number = _parse_numeric_string(value)
+    two_places = Decimal("0.01")
+    # Fixed-point formatting: never scientific notation, sign preserved.
+    return f"{number.quantize(two_places):f}"
+
+
+# ---------------------------------------------------------------------------
+# Date
+# ---------------------------------------------------------------------------
+
+
+def normalize_date(value: date | datetime | str) -> str:
+    """Return ``value`` as an ISO ``YYYY-MM-DD`` string.
+
+    * ``datetime`` -> its date part; ``date`` -> unchanged.
+    * ``str`` -> parsed. ISO 8601 input (``"2026-03-04"``,
+      ``"2026-03-04T10:30:00"``) is handled by the standard library first;
+      everything else goes through dateutil with ``dayfirst=True`` so that
+      ``"31.03.2026"`` / ``"04/03/2026"`` read as day-month-year.
+
+    Raises:
+        Whatever ``dateutil.parser.parse`` raises for text it cannot parse.
+    """
+    if isinstance(value, datetime):
+        return value.date().isoformat()
+    if isinstance(value, date):
+        return value.isoformat()
+
+    text = value.strip()
+    # ISO strings must not reach dateutil with dayfirst=True: for year-first
+    # input it swaps month and day whenever the last field is <= 12
+    # ("2026-03-04" would come back as 2026-04-03).
+    try:
+        return datetime.fromisoformat(text).date().isoformat()
+    except ValueError:
+        pass
+    parsed = _dateparser.parse(text, dayfirst=True)
+    return parsed.date().isoformat()
+
+
+# ---------------------------------------------------------------------------
+# IBAN
+# ---------------------------------------------------------------------------
+
+
+def normalize_iban(s: str) -> str:
+    """Remove every whitespace character from ``s`` and upper-case the rest.
+
+    The IBAN format itself is not validated here; that is left to the caller.
+    """
+    compact = "".join(ch for ch in s if not ch.isspace())
+    return compact.upper()
+
+
+# ---------------------------------------------------------------------------
+# Short-value skip rule
+# ---------------------------------------------------------------------------
+
+
+def _is_literal_type(field_type: Any) -> bool:
+    """Return True for ``Literal[...]``, also when wrapped in Optional/Union."""
+    import types
+    import typing
+
+    origin = get_origin(field_type)
+    if origin is typing.Literal:
+        return True
+    # ``Optional[X]`` / ``Union[...]`` report ``typing.Union``; ``X | None``
+    # on plain classes reports ``types.UnionType`` where that type exists.
+    if origin is typing.Union or origin is getattr(types, "UnionType", typing.Union):
+        members = [arg for arg in typing.get_args(field_type) if arg is not type(None)]
+        return bool(members) and all(_is_literal_type(arg) for arg in members)
+    return False
+
+
+def should_skip_text_agreement(value: Any, field_type: Any) -> bool:
+    """Return True when ``text_agreement`` should be recorded as ``None``.
+
+    Rules (spec §6 ReliabilityStep):
+
+    1. ``value is None`` → skip.
+    2. ``field_type`` is a ``Literal[...]`` → skip (enum labels don't appear
+       verbatim in the source text). A union made only of literals and
+       ``None`` (``Optional[Literal[...]]``, ``Literal[...] | None``) counts
+       as a literal type as well.
+    3. Stringified value length ≤ 2 chars → skip (short strings collide with
+       random OCR noise).
+    4. Numeric value (int/float/Decimal) with ``|v| < 10`` → skip.
+
+    Rule 4 is evaluated before rule 3, so a numeric ``10`` is judged by its
+    magnitude rather than by the length of ``"10"``.
+
+    ``provenance_verified`` still runs for all of these — the bbox-anchored
+    cite is stronger than a global text scan for short values.
+    """
+    if value is None:
+        return True
+
+    if _is_literal_type(field_type):
+        return True
+
+    # Numeric short-value rule. Booleans are a subtype of int; they are
+    # excluded so they fall through to the string rule ("True" has len 4 so
+    # it doesn't trip anyway).
+    if not isinstance(value, bool) and isinstance(value, (int, float, Decimal)):
+        try:
+            return abs(Decimal(str(value))) < 10
+        except InvalidOperation:
+            # Ordering comparisons involving NaN signal InvalidOperation;
+            # such values are judged by the string rule below instead.
+            pass
+
+    # Stringified length rule (strings and anything not numeric).
+    return len(str(value)) <= 2
--- a/tests/unit/test_provenance_normalize.py
+++ b/tests/unit/test_provenance_normalize.py
@ -0,0 +1,124 @@
+"""Tests for the provenance normalisers (spec §6 ReliabilityStep)."""
+
+from __future__ import annotations
+
+from datetime import date, datetime
+from decimal import Decimal
+from typing import Literal
+
+from ix.provenance.normalize import (
+    normalize_date,
+    normalize_iban,
+    normalize_number,
+    normalize_string,
+    should_skip_text_agreement,
+)
+
+
+class TestNormalizeString:
+    """Checks for :func:`normalize_string`."""
+
+    def test_uppercase_casefolded_and_punctuation_stripped(self) -> None:
+        result = normalize_string("  FOO bar!!!  ")
+        assert result == "foo bar"
+
+    def test_nfkc_applied_for_fullwidth(self) -> None:
+        # "UBS" spelled with fullwidth capitals (U+FF35, U+FF22, U+FF33);
+        # NFKC normalisation maps them to their ASCII counterparts.
+        raw = "\uff35\uff22\uff33" + " AG"
+        assert normalize_string(raw) == "ubs ag"
+
+    def test_whitespace_collapse(self) -> None:
+        result = normalize_string("UBS   Switzerland\tAG")
+        assert result == "ubs switzerland ag"
+
+    def test_strips_common_punctuation(self) -> None:
+        # Comma, dot, parentheses, semicolon, colon and slash all disappear.
+        raw = "Hello, World. (foo); bar: baz / qux"
+        expected = "hello world foo bar baz qux"
+        assert normalize_string(raw) == expected
+
+    def test_empty_string(self) -> None:
+        result = normalize_string("")
+        assert result == ""
+
+
+class TestNormalizeNumber:
+    """Checks for :func:`normalize_number` (strings and native numbers)."""
+
+    def test_chf_swiss_apostrophe_thousands(self) -> None:
+        swiss = "CHF 1'234.56"
+        assert normalize_number(swiss) == "1234.56"
+
+    def test_de_de_dot_thousands_and_comma_decimal(self) -> None:
+        german = "1.234,56 EUR"
+        assert normalize_number(german) == "1234.56"
+
+    def test_negative_sign(self) -> None:
+        expectations = {
+            "-123.45": "-123.45",
+            "CHF -1'234.56": "-1234.56",
+        }
+        for raw, expected in expectations.items():
+            assert normalize_number(raw) == expected
+
+    def test_int_input(self) -> None:
+        result = normalize_number(42)
+        assert result == "42.00"
+
+    def test_float_input(self) -> None:
+        result = normalize_number(1234.5)
+        assert result == "1234.50"
+
+    def test_decimal_input(self) -> None:
+        amount = Decimal("1234.56")
+        assert normalize_number(amount) == "1234.56"
+
+    def test_trailing_zero_is_canonicalised(self) -> None:
+        result = normalize_number("1234.5")
+        assert result == "1234.50"
+
+    def test_no_decimal_part(self) -> None:
+        result = normalize_number("1234")
+        assert result == "1234.00"
+
+
+class TestNormalizeDate:
+    """Checks for :func:`normalize_date`; every case targets 2026-03-31."""
+
+    def test_dayfirst_dotted(self) -> None:
+        result = normalize_date("31.03.2026")
+        assert result == "2026-03-31"
+
+    def test_iso_date(self) -> None:
+        result = normalize_date("2026-03-31")
+        assert result == "2026-03-31"
+
+    def test_date_object(self) -> None:
+        day = date(2026, 3, 31)
+        assert normalize_date(day) == "2026-03-31"
+
+    def test_datetime_object(self) -> None:
+        moment = datetime(2026, 3, 31, 10, 30)
+        assert normalize_date(moment) == "2026-03-31"
+
+    def test_slash_variant(self) -> None:
+        result = normalize_date("31/03/2026")
+        assert result == "2026-03-31"
+
+
+class TestNormalizeIban:
+    """Checks for :func:`normalize_iban`."""
+
+    def test_uppercase_and_strip_whitespace(self) -> None:
+        spaced = "de 89 3704 0044 0532 0130 00"
+        assert normalize_iban(spaced) == "DE89370400440532013000"
+
+    def test_already_normalised(self) -> None:
+        canonical = "CH9300762011623852957"
+        assert normalize_iban(canonical) == canonical
+
+    def test_tabs_and_newlines(self) -> None:
+        messy = "ch 93\t0076\n2011623852957"
+        assert normalize_iban(messy) == "CH9300762011623852957"
+
+
+class TestShouldSkipTextAgreement:
+    """Checks for :func:`should_skip_text_agreement`."""
+
+    def test_short_string_skipped(self) -> None:
+        skipped = should_skip_text_agreement("AB", str)
+        assert skipped is True
+
+    def test_long_string_not_skipped(self) -> None:
+        skipped = should_skip_text_agreement("ABC", str)
+        assert skipped is False
+
+    def test_number_abs_lt_10_skipped(self) -> None:
+        small_values = [
+            (0, int),
+            (9, int),
+            (-9, int),
+            (9.5, float),
+            (Decimal("9.99"), Decimal),
+        ]
+        for number, number_type in small_values:
+            assert should_skip_text_agreement(number, number_type) is True
+
+    def test_number_abs_ge_10_not_skipped(self) -> None:
+        large_values = [
+            (10, int),
+            (-10, int),
+            (Decimal("1234.56"), Decimal),
+        ]
+        for number, number_type in large_values:
+            assert should_skip_text_agreement(number, number_type) is False
+
+    def test_literal_type_skipped(self) -> None:
+        account_kind = Literal["checking", "credit", "savings"]
+        assert should_skip_text_agreement("checking", account_kind) is True
+
+    def test_none_value_skipped(self) -> None:
+        for declared_type in (str, None):
+            assert should_skip_text_agreement(None, declared_type) is True
+
+    def test_numeric_string_treated_as_string(self) -> None:
+        # A one-character numeric string is caught by the length rule.
+        skipped = should_skip_text_agreement("9", str)
+        assert skipped is True