3 changed files with 0 additions and 337 deletions
--- a/src/ix/provenance/init.py
+++ b/src/ix/provenance/init.py
@ -1,32 +0,0 @@
 """Provenance subsystem — normalisers, mapper, verifier.
 Three pieces compose the reliability check:
 * :mod:`ix.provenance.normalize` — pure text/number/date/IBAN normalisers
  used to compare OCR snippets to extracted values.
 * :mod:`ix.provenance.mapper` — resolves LLM-emitted segment IDs to
  :class:`~ix.contracts.provenance.FieldProvenance` entries.
 * :mod:`ix.provenance.verify` — per-field-type dispatcher that writes the
  ``provenance_verified`` / ``text_agreement`` flags.
 Only :mod:`normalize` is exported from the package at this step; the mapper
 and verifier land in task 1.8.
 """
 from __future__ import annotations
 from ix.provenance.normalize import (
    normalize_date,
    normalize_iban,
    normalize_number,
    normalize_string,
    should_skip_text_agreement,
 )
 __all__ = [
    "normalize_date",
    "normalize_iban",
    "normalize_number",
    "normalize_string",
    "should_skip_text_agreement",
 ]
--- a/src/ix/provenance/normalize.py
+++ b/src/ix/provenance/normalize.py
@ -1,181 +0,0 @@
 """Pure normalisers used by the reliability check (spec §6).
 The ReliabilityStep compares extracted values against OCR segment snippets
 (and raw ``context.texts``) after passing both sides through the same
 normaliser. Keeping these functions pure (no IO, no state) means the
 ReliabilityStep itself can stay a thin dispatcher and every rule is
 directly unit-testable.
 All normalisers return ``str`` so the downstream ``substring`` / ``equals``
 comparison is trivial.
 """
 from __future__ import annotations
 import re
 import unicodedata
 from datetime import date, datetime
 from decimal import Decimal, InvalidOperation
 from typing import Any, get_origin
 from dateutil import parser as _dateparser
 # ---------------------------------------------------------------------------
 # String
 # ---------------------------------------------------------------------------
 # Strip punctuation that rarely carries meaning when comparing extracted
 # values with OCR text: dot, comma, colon, semicolon, exclamation and question
 # marks, parens/brackets/braces, slash, backslash, quotes, and backtick.
 _PUNCTUATION_RE = re.compile(r"[.,:;!?()\[\]{}/\\'\"`]")
 _WHITESPACE_RE = re.compile(r"\s+")
 def normalize_string(s: str) -> str:
    """Canonicalise free text for comparison.
    Applies, in order: Unicode NFKC normalisation, case folding, replacement
    of every punctuation character matched by ``_PUNCTUATION_RE`` with a
    space, and finally collapsing whitespace runs to a single space with the
    ends trimmed.
    """
    folded = unicodedata.normalize("NFKC", s).casefold()
    # Punctuation becomes a space (not an empty string) so "a/b" stays two words.
    spaced = _PUNCTUATION_RE.sub(" ", folded)
    collapsed = _WHITESPACE_RE.sub(" ", spaced)
    return collapsed.strip()
 # ---------------------------------------------------------------------------
 # Number
 # ---------------------------------------------------------------------------
 # Strip currency symbols / codes and everything else that isn't a digit, sign,
 # apostrophe, dot, comma, or whitespace. The apostrophe/whitespace/dot/comma
 # handling is done in a second pass that figures out thousands-separator vs.
 # decimal-separator from structure.
 _NUMERIC_KEEP_RE = re.compile(r"[^0-9.,'\s\-+]")
 def _parse_numeric_string(raw: str) -> Decimal:
    """Heuristically decode a localised number string into a ``Decimal``.
    Rules:
    * U+2212 MINUS SIGN is mapped to ASCII ``-`` first, so a typographic
      minus does not silently lose the sign.
    * Strip anything that isn't a digit, sign, dot, comma, apostrophe, or
      whitespace (this drops currency symbols / codes).
    * Apostrophes are always thousands separators (Swiss-German style).
    * Whitespace of any kind, including U+00A0 and U+202F, is always a
      thousands separator (fr-FR style).
    * If both ``.`` and ``,`` appear, the rightmost is the decimal separator
      and the other is the thousands separator.
    * Only ``,``: it is the decimal separator when exactly 2 digits follow
      the last comma, otherwise a thousands separator.
    * Only ``.``: dots are stripped as thousands separators only when there
      are several of them and the tail is not 2 digits; a single dot is
      always kept as the decimal point (1.5 is 1.5).
    Raises:
        InvalidOperation: if nothing numeric is left after cleaning, or the
            cleaned text is not a valid decimal literal.
    """
    cleaned = _NUMERIC_KEEP_RE.sub("", raw.replace("\u2212", "-"))
    # str.split() without arguments splits on every Unicode whitespace
    # character, so non-breaking group separators are removed too.
    cleaned = "".join(cleaned.replace("'", "").split())
    has_dot = "." in cleaned
    has_comma = "," in cleaned
    if has_dot and has_comma:
        if cleaned.rfind(".") > cleaned.rfind(","):
            # dot is decimal
            cleaned = cleaned.replace(",", "")
        else:
            # comma is decimal
            cleaned = cleaned.replace(".", "").replace(",", ".")
    elif has_comma:
        # Only comma: treat as decimal if 2 digits follow, else thousands.
        tail = cleaned.rpartition(",")[2]
        if len(tail) == 2 and tail.isdigit():
            cleaned = cleaned.replace(",", ".")
        else:
            cleaned = cleaned.replace(",", "")
    elif has_dot:
        # Only dot: several dots with a non-2-digit tail must be thousands
        # separators (e.g. "1.234.567"); anything else is left untouched.
        tail = cleaned.rpartition(".")[2]
        if (len(tail) != 2 or not tail.isdigit()) and cleaned.count(".") > 1:
            cleaned = cleaned.replace(".", "")
    if cleaned in ("", "+", "-"):
        raise InvalidOperation(f"cannot parse number: {raw!r}")
    return Decimal(cleaned)
 def normalize_number(value: int | float | Decimal | str) -> str:
    """Return the canonical ``"[-]DDD.DD"`` form, always 2 decimal places.
    Accepts localized strings (``"CHF 1'234.56"``, ``"1.234,56 EUR"``,
    ``"-123.45"``) as well as native numeric types. A value that quantizes
    to zero is rendered as ``"0.00"``, never ``"-0.00"``, so both sides of a
    comparison agree on zero.
    Raises:
        InvalidOperation: if a string input cannot be parsed as a number.
    """
    if isinstance(value, Decimal):
        dec = value
    elif isinstance(value, (int, float)):
        # Go through str() so a float contributes its short repr rather than
        # its exact binary expansion.
        dec = Decimal(str(value))
    else:
        dec = _parse_numeric_string(value)
    quantized = dec.quantize(Decimal("0.01"))
    if quantized.is_zero():
        # Drop the sign of negative zero; the canonical form has one zero.
        quantized = quantized.copy_abs()
    return format(quantized, "f")
 # ---------------------------------------------------------------------------
 # Date
 # ---------------------------------------------------------------------------
 def normalize_date(value: date | datetime | str) -> str:
    """Return ``value`` as an ISO ``YYYY-MM-DD`` string.
    ``date`` / ``datetime`` objects are formatted directly. Strings shaped
    like ``YYYY-MM-DD...`` are parsed with ``datetime.fromisoformat`` first:
    ISO dates are year-month-day by definition, so they are never subject to
    the ``dayfirst`` month/day preference. Every other string, and any
    ISO-looking string the standard library rejects, goes through dateutil
    with ``dayfirst=True``. Parsing errors from dateutil propagate unchanged.
    """
    # datetime must be tested before date because it is a subclass of date.
    if isinstance(value, datetime):
        return value.date().isoformat()
    if isinstance(value, date):
        return value.isoformat()
    text = value.strip()
    looks_iso = (
        len(text) >= 10
        and text[:4].isdigit()
        and text[4] == "-"
        and text[7] == "-"
    )
    if looks_iso:
        try:
            iso_parsed = datetime.fromisoformat(text)
        except ValueError:
            # Not strict ISO after all; let dateutil decide below.
            iso_parsed = None
        if iso_parsed is not None:
            return iso_parsed.date().isoformat()
    parsed = _dateparser.parse(value, dayfirst=True)
    return parsed.date().isoformat()
 # ---------------------------------------------------------------------------
 # IBAN
 # ---------------------------------------------------------------------------
 def normalize_iban(s: str) -> str:
    """Return ``s`` upper-cased with every whitespace character removed.
    No IBAN format validation happens here; that is left to the call site.
    """
    chunks = s.upper().split()
    return "".join(chunks)
 # ---------------------------------------------------------------------------
 # Short-value skip rule
 # ---------------------------------------------------------------------------
 def should_skip_text_agreement(value: Any, field_type: Any) -> bool:
    """Return True when ``text_agreement`` should be recorded as ``None``.
    Rules (spec §6 ReliabilityStep), checked in this order:
    1. ``value is None`` → skip.
    2. ``field_type`` is a ``Literal[...]``, or an ``Optional`` / ``Union``
       whose non-``None`` members are all ``Literal[...]`` → skip (enum
       labels don't appear verbatim in the source text).
    3. Numeric value (int/float/Decimal, but not bool) → skip exactly when
       ``|v| < 10``; the length rule is not consulted for numbers.
    4. Anything else: stringified value length ≤ 2 chars → skip (short
       strings collide with random OCR noise).
    ``provenance_verified`` still runs for all of these — the bbox-anchored
    cite is stronger than a global text scan for short values.
    """
    import typing
    if value is None:
        return True
    origin = get_origin(field_type)
    if origin is typing.Literal:
        return True
    if origin is typing.Union:
        # Optional[Literal[...]] reports Union as its origin, so look through
        # the union and ignore the None member.
        members = [arg for arg in typing.get_args(field_type) if arg is not type(None)]
        if members and all(get_origin(arg) is typing.Literal for arg in members):
            return True
    # Numeric rule runs before the length rule so that 10 (two characters
    # when stringified) is still judged by magnitude. bool is a subclass of
    # int and is deliberately routed to the length rule instead.
    if isinstance(value, (int, float, Decimal)) and not isinstance(value, bool):
        try:
            return abs(Decimal(str(value))) < 10
        except InvalidOperation:
            # Values Decimal cannot order (e.g. NaN) fall back to the
            # stringified-length rule below.
            pass
    return len(str(value)) <= 2
--- a/tests/unit/test_provenance_normalize.py
+++ b/tests/unit/test_provenance_normalize.py
@ -1,124 +0,0 @@
 """Tests for the provenance normalisers (spec §6 ReliabilityStep)."""
 from __future__ import annotations
 from datetime import date, datetime
 from decimal import Decimal
 from typing import Literal
 from ix.provenance.normalize import (
    normalize_date,
    normalize_iban,
    normalize_number,
    normalize_string,
    should_skip_text_agreement,
 )
 class TestNormalizeString:
    """Checks for ``normalize_string``: folding, punctuation, whitespace."""
    def test_uppercase_casefolded_and_punctuation_stripped(self) -> None:
        messy = "  FOO bar!!!  "
        assert normalize_string(messy) == "foo bar"
    def test_nfkc_applied_for_fullwidth(self) -> None:
        # "UBS" spelled with fullwidth capitals (U+FF00 block); the expected
        # result is plain ASCII after normalisation.
        fullwidth_ubs = "\uff35\uff22\uff33"
        result = normalize_string(f"{fullwidth_ubs} AG")
        assert result == "ubs ag"
    def test_whitespace_collapse(self) -> None:
        result = normalize_string("UBS   Switzerland\tAG")
        assert result == "ubs switzerland ag"
    def test_strips_common_punctuation(self) -> None:
        # Covers colon, comma, dot, semicolon, parens, and slash in one go.
        noisy = "Hello, World. (foo); bar: baz / qux"
        expected = "hello world foo bar baz qux"
        assert normalize_string(noisy) == expected
    def test_empty_string(self) -> None:
        result = normalize_string("")
        assert result == ""
 class TestNormalizeNumber:
    """Checks for ``normalize_number`` on localised strings and native numbers."""
    def test_chf_swiss_apostrophe_thousands(self) -> None:
        swiss = "CHF 1'234.56"
        assert normalize_number(swiss) == "1234.56"
    def test_de_de_dot_thousands_and_comma_decimal(self) -> None:
        german = "1.234,56 EUR"
        assert normalize_number(german) == "1234.56"
    def test_negative_sign(self) -> None:
        plain = normalize_number("-123.45")
        assert plain == "-123.45"
        with_currency = normalize_number("CHF -1'234.56")
        assert with_currency == "-1234.56"
    def test_int_input(self) -> None:
        result = normalize_number(42)
        assert result == "42.00"
    def test_float_input(self) -> None:
        result = normalize_number(1234.5)
        assert result == "1234.50"
    def test_decimal_input(self) -> None:
        amount = Decimal("1234.56")
        assert normalize_number(amount) == "1234.56"
    def test_trailing_zero_is_canonicalised(self) -> None:
        result = normalize_number("1234.5")
        assert result == "1234.50"
    def test_no_decimal_part(self) -> None:
        result = normalize_number("1234")
        assert result == "1234.00"
 class TestNormalizeDate:
    """Checks for ``normalize_date`` on strings and date/datetime objects."""
    def test_dayfirst_dotted(self) -> None:
        dotted = "31.03.2026"
        assert normalize_date(dotted) == "2026-03-31"
    def test_iso_date(self) -> None:
        iso = "2026-03-31"
        assert normalize_date(iso) == "2026-03-31"
    def test_date_object(self) -> None:
        day = date(2026, 3, 31)
        assert normalize_date(day) == "2026-03-31"
    def test_datetime_object(self) -> None:
        # The time-of-day part must not show up in the result.
        moment = datetime(2026, 3, 31, 10, 30)
        assert normalize_date(moment) == "2026-03-31"
    def test_slash_variant(self) -> None:
        slashed = "31/03/2026"
        assert normalize_date(slashed) == "2026-03-31"
 class TestNormalizeIban:
    """Checks for ``normalize_iban``: upper-casing and whitespace removal."""
    def test_uppercase_and_strip_whitespace(self) -> None:
        spaced = "de 89 3704 0044 0532 0130 00"
        assert normalize_iban(spaced) == "DE89370400440532013000"
    def test_already_normalised(self) -> None:
        canonical = "CH9300762011623852957"
        assert normalize_iban(canonical) == canonical
    def test_tabs_and_newlines(self) -> None:
        ragged = "ch 93\t0076\n2011623852957"
        assert normalize_iban(ragged) == "CH9300762011623852957"
 class TestShouldSkipTextAgreement:
    """Checks for the ``should_skip_text_agreement`` skip rules."""
    def test_short_string_skipped(self) -> None:
        verdict = should_skip_text_agreement("AB", str)
        assert verdict is True
    def test_long_string_not_skipped(self) -> None:
        verdict = should_skip_text_agreement("ABC", str)
        assert verdict is False
    def test_number_abs_lt_10_skipped(self) -> None:
        small_values = (
            (0, int),
            (9, int),
            (-9, int),
            (9.5, float),
            (Decimal("9.99"), Decimal),
        )
        for number, kind in small_values:
            assert should_skip_text_agreement(number, kind) is True
    def test_number_abs_ge_10_not_skipped(self) -> None:
        large_values = (
            (10, int),
            (-10, int),
            (Decimal("1234.56"), Decimal),
        )
        for number, kind in large_values:
            assert should_skip_text_agreement(number, kind) is False
    def test_literal_type_skipped(self) -> None:
        account_kind = Literal["checking", "credit", "savings"]
        verdict = should_skip_text_agreement("checking", account_kind)
        assert verdict is True
    def test_none_value_skipped(self) -> None:
        # A None value is skipped regardless of the declared field type.
        for kind in (str, None):
            assert should_skip_text_agreement(None, kind) is True
    def test_numeric_string_treated_as_string(self) -> None:
        # A one-character numeric string is still caught by the length rule.
        verdict = should_skip_text_agreement("9", str)
        assert verdict is True