diff --git a/src/ix/provenance/__init__.py b/src/ix/provenance/__init__.py new file mode 100644 index 0000000..3028fc5 --- /dev/null +++ b/src/ix/provenance/__init__.py @@ -0,0 +1,32 @@ +"""Provenance subsystem — normalisers, mapper, verifier. + +Three pieces compose the reliability check: + +* :mod:`ix.provenance.normalize` — pure text/number/date/IBAN normalisers + used to compare OCR snippets to extracted values. +* :mod:`ix.provenance.mapper` — resolves LLM-emitted segment IDs to + :class:`~ix.contracts.provenance.FieldProvenance` entries. +* :mod:`ix.provenance.verify` — per-field-type dispatcher that writes the + ``provenance_verified`` / ``text_agreement`` flags. + +Only :mod:`normalize` is exported from the package at this step; the mapper +and verifier land in task 1.8. +""" + +from __future__ import annotations + +from ix.provenance.normalize import ( + normalize_date, + normalize_iban, + normalize_number, + normalize_string, + should_skip_text_agreement, +) + +__all__ = [ + "normalize_date", + "normalize_iban", + "normalize_number", + "normalize_string", + "should_skip_text_agreement", +] diff --git a/src/ix/provenance/normalize.py b/src/ix/provenance/normalize.py new file mode 100644 index 0000000..554d734 --- /dev/null +++ b/src/ix/provenance/normalize.py @@ -0,0 +1,181 @@ +"""Pure normalisers used by the reliability check (spec §6). + +The ReliabilityStep compares extracted values against OCR segment snippets +(and raw ``context.texts``) after passing both sides through the same +normaliser. Keeping these functions pure (no IO, no state) means the +ReliabilityStep itself can stay a thin dispatcher and every rule is +directly unit-testable. + +All normalisers return ``str`` so the downstream ``substring`` / ``equals`` +comparison is trivial. 
# ---------------------------------------------------------------------------
# String
# ---------------------------------------------------------------------------

# Punctuation that rarely carries meaning when an extracted value is compared
# with OCR text: dot, comma, colon, semicolon, exclamation and question marks,
# parentheses/brackets/braces, slash, backslash, quotes and backtick.
_PUNCTUATION_RE = re.compile(r"[.,:;!?()\[\]{}/\\'\"`]")
_WHITESPACE_RE = re.compile(r"\s+")


def normalize_string(s: str) -> str:
    """Return *s* in the canonical form used for text comparison.

    Steps, in order: NFKC normalisation, ``casefold()``, replacement of every
    character matched by ``_PUNCTUATION_RE`` with a space, collapse of each
    whitespace run to a single space, and trimming of both ends.
    """
    folded = unicodedata.normalize("NFKC", s).casefold()
    spaced = _PUNCTUATION_RE.sub(" ", folded)
    return _WHITESPACE_RE.sub(" ", spaced).strip()


# ---------------------------------------------------------------------------
# Number
# ---------------------------------------------------------------------------

# First pass: drop everything that is not a digit, sign, apostrophe, dot,
# comma or whitespace (this removes currency symbols and codes).
_NUMERIC_KEEP_RE = re.compile(r"[^0-9.,'\s\-+]")

# Shape of an integer whose single separator type is used for digit grouping:
# a leading group of 1-3 digits that does not start with 0, optional middle
# groups of 2-3 digits (2 covers Indian-style grouping) and a final group of
# exactly 3 digits.
_GROUPED_INTEGER_RES = {
    ",": re.compile(r"[+-]?[1-9]\d{0,2}(?:,\d{2,3})*,\d{3}"),
    ".": re.compile(r"[+-]?[1-9]\d{0,2}(?:\.\d{2,3})*\.\d{3}"),
}

# U+2212 MINUS SIGN; mapped to ASCII "-" so the sign survives the first pass.
_UNICODE_MINUS = "\u2212"

_TWO_PLACES = Decimal("0.01")


def _resolve_single_separator(text: str, sep: str) -> str:
    """Rewrite *text*, whose only separator character is *sep*, for ``Decimal``.

    * A single ``.`` is always the decimal point (``1.5``, ``1.234``).
    * If *text* looks like a grouped integer (see ``_GROUPED_INTEGER_RES``)
      every *sep* is a thousands separator and is removed.
    * Otherwise the last *sep* is the decimal separator and any earlier ones
      are thousands separators.
    """
    if sep == "." and text.count(sep) == 1:
        return text
    if _GROUPED_INTEGER_RES[sep].fullmatch(text):
        return text.replace(sep, "")
    head, _, tail = text.rpartition(sep)
    return f"{head.replace(sep, '')}.{tail}"


def _parse_numeric_string(raw: str) -> Decimal:
    """Heuristically decode a localised number string into a ``Decimal``.

    Rules:

    * U+2212 MINUS SIGN is treated as ``-``.
    * Anything that isn't a digit, sign, dot, comma, apostrophe, or
      whitespace is dropped (currency symbols / codes).
    * Apostrophes and all whitespace (including no-break spaces and tabs)
      are thousands separators and are removed.
    * If both ``.`` and ``,`` appear, the rightmost is the decimal separator
      and the other is the thousands separator.
    * If only one of them appears, ``_resolve_single_separator`` decides:
      ``"1,234"`` and ``"1.234.567"`` are grouped integers, ``"1,5"``,
      ``"1234,56"`` and ``"1.234.56"`` carry a decimal part, and a single
      dot is always a decimal point.

    Raises:
        InvalidOperation: if no digit is left after cleaning, or the cleaned
            text is not a valid decimal literal (e.g. a sign in the middle).
    """
    cleaned = _NUMERIC_KEEP_RE.sub("", raw.replace(_UNICODE_MINUS, "-"))
    # str.split() splits on every Unicode whitespace character, so NBSP and
    # narrow NBSP group separators are removed as well as plain spaces.
    cleaned = "".join(cleaned.replace("'", "").split())

    has_dot = "." in cleaned
    has_comma = "," in cleaned

    if has_dot and has_comma:
        if cleaned.rfind(".") > cleaned.rfind(","):
            # dot is decimal
            cleaned = cleaned.replace(",", "")
        else:
            # comma is decimal
            cleaned = cleaned.replace(".", "").replace(",", ".")
    elif has_comma:
        cleaned = _resolve_single_separator(cleaned, ",")
    elif has_dot:
        cleaned = _resolve_single_separator(cleaned, ".")

    if not any(ch.isdigit() for ch in cleaned):
        raise InvalidOperation(f"cannot parse number: {raw!r}")
    return Decimal(cleaned)


def normalize_number(value: int | float | Decimal | str) -> str:
    """Return ``"[-]DDD.DD"`` canonical form — always 2 decimal places.

    Accepts localized strings (``"CHF 1'234.56"``, ``"1.234,56 EUR"``,
    ``"-123.45"``) as well as native numeric types. Rounding to two places
    uses the active ``decimal`` context. A result of zero is always rendered
    as ``"0.00"``, never ``"-0.00"``, so that equal amounts compare equal as
    strings.

    Raises:
        InvalidOperation: if a string cannot be parsed, or the value is NaN
            or infinite.
    """
    if isinstance(value, Decimal):
        dec = value
    elif isinstance(value, (int, float)):
        dec = Decimal(str(value))
    else:
        dec = _parse_numeric_string(value)

    if not dec.is_finite():
        raise InvalidOperation(f"cannot normalise non-finite number: {value!r}")

    quantized = dec.quantize(_TWO_PLACES)
    if quantized.is_zero():
        # Drop the sign of a negative zero such as Decimal("-0.00").
        quantized = quantized.copy_abs()
    return format(quantized, "f")
# ---------------------------------------------------------------------------
# Date
# ---------------------------------------------------------------------------

# Year-first numeric date (ISO ``YYYY-MM-DD`` and its ``/`` or ``.`` variants),
# optionally followed by a time part. The same separator must be used twice.
_YEAR_FIRST_DATE_RE = re.compile(r"(\d{4})([-/.])(\d{1,2})\2(\d{1,2})(?!\d)")


def normalize_date(value: date | datetime | str) -> str:
    """Return *value* as an ISO ``YYYY-MM-DD`` string.

    ``datetime`` and ``date`` objects are formatted directly. Strings that
    start with a four-digit year are read as year-month-day without going
    through dateutil: with ``dayfirst=True`` dateutil interprets such input
    as year-day-month whenever the last field is <= 12, which would turn
    ``"2026-03-04"`` into ``2026-04-03``. All other strings are parsed via
    dateutil with ``dayfirst=True`` (``"31.03.2026"``, ``"31/03/2026"``).
    Any time or UTC-offset part is discarded without conversion.

    Raises:
        ValueError: if the string does not describe a valid date.
    """
    if isinstance(value, datetime):
        return value.date().isoformat()
    if isinstance(value, date):
        return value.isoformat()

    text = value.strip()
    year_first = _YEAR_FIRST_DATE_RE.match(text)
    if year_first:
        year, _, month, day = year_first.groups()
        return date(int(year), int(month), int(day)).isoformat()

    parsed = _dateparser.parse(text, dayfirst=True)
    return parsed.date().isoformat()


# ---------------------------------------------------------------------------
# IBAN
# ---------------------------------------------------------------------------


def normalize_iban(s: str) -> str:
    """Upper-case + strip all whitespace. No format validation (call site's job)."""
    return "".join(s.split()).upper()


# ---------------------------------------------------------------------------
# Short-value skip rule
# ---------------------------------------------------------------------------


def _is_literal_type(field_type: Any) -> bool:
    """Return True when *field_type* is a ``Literal[...]``, possibly wrapped.

    ``Annotated[X, ...]`` is unwrapped to ``X``. A union (``Optional[X]``,
    ``Union[...]`` or ``X | Y``) counts as a literal type when it has at
    least one non-``None`` member and every such member is a literal type.
    """
    import types
    import typing

    origin = get_origin(field_type)
    if origin is typing.Literal:
        return True
    if origin is typing.Annotated:
        return _is_literal_type(typing.get_args(field_type)[0])

    # ``X | Y`` has its own origin type on interpreters that provide
    # ``types.UnionType``; look it up defensively.
    union_origins = (typing.Union, getattr(types, "UnionType", typing.Union))
    if origin in union_origins:
        members = [
            arg for arg in typing.get_args(field_type) if arg is not type(None)
        ]
        return bool(members) and all(_is_literal_type(arg) for arg in members)
    return False


def should_skip_text_agreement(value: Any, field_type: Any) -> bool:
    """Return True when ``text_agreement`` should be recorded as ``None``.

    Rules (spec §6 ReliabilityStep):

    1. ``value is None`` → skip.
    2. ``field_type`` is a ``Literal[...]`` → skip (enum labels don't appear
       verbatim in the source text). ``Optional[Literal[...]]``,
       ``Literal[...] | None`` and ``Annotated[Literal[...], ...]`` are
       treated the same way.
    3. Stringified value length ≤ 2 chars → skip (short strings collide with
       random OCR noise).
    4. Numeric value (int/float/Decimal) with ``|v| < 10`` → skip.

    ``provenance_verified`` still runs for all of these — the bbox-anchored
    cite is stronger than a global text scan for short values.
    """
    if value is None:
        return True

    if _is_literal_type(field_type):
        return True

    # Numeric short-value rule — checked before the stringified-length rule so
    # that 10 (two characters as text) is decided on the numeric side.
    # Booleans are a subtype of int; they are excluded here and fall through
    # to the string rule ("True" / "False" are longer than 2 characters).
    if not isinstance(value, bool) and isinstance(value, (int, float, Decimal)):
        try:
            return abs(Decimal(str(value))) < 10
        except InvalidOperation:
            # Comparing NaN signals InvalidOperation; use the string rule.
            pass

    # Stringified length rule (strings and anything not numeric).
    return len(str(value)) <= 2
class TestNormalizeString:
    """``normalize_string``: NFKC, casefold, punctuation strip, whitespace collapse."""

    def test_uppercase_casefolded_and_punctuation_stripped(self) -> None:
        result = normalize_string(" FOO bar!!! ")
        assert result == "foo bar"

    def test_nfkc_applied_for_fullwidth(self) -> None:
        # "UBS" written with fullwidth capitals (U+FF35, U+FF22, U+FF33);
        # NFKC maps them to their ASCII counterparts.
        fullwidth_ubs = "\uff35\uff22\uff33"
        result = normalize_string(f"{fullwidth_ubs} AG")
        assert result == "ubs ag"

    def test_whitespace_collapse(self) -> None:
        result = normalize_string("UBS Switzerland\tAG")
        assert result == "ubs switzerland ag"

    def test_strips_common_punctuation(self) -> None:
        # Covers comma, dot, parentheses, semicolon, colon and slash.
        noisy = "Hello, World. (foo); bar: baz / qux"
        expected = "hello world foo bar baz qux"
        assert normalize_string(noisy) == expected

    def test_empty_string(self) -> None:
        result = normalize_string("")
        assert result == ""


class TestNormalizeNumber:
    """``normalize_number``: canonical two-decimal rendering."""

    def test_chf_swiss_apostrophe_thousands(self) -> None:
        result = normalize_number("CHF 1'234.56")
        assert result == "1234.56"

    def test_de_de_dot_thousands_and_comma_decimal(self) -> None:
        result = normalize_number("1.234,56 EUR")
        assert result == "1234.56"

    def test_negative_sign(self) -> None:
        cases = [
            ("-123.45", "-123.45"),
            ("CHF -1'234.56", "-1234.56"),
        ]
        for raw, expected in cases:
            assert normalize_number(raw) == expected

    def test_int_input(self) -> None:
        result = normalize_number(42)
        assert result == "42.00"

    def test_float_input(self) -> None:
        result = normalize_number(1234.5)
        assert result == "1234.50"

    def test_decimal_input(self) -> None:
        result = normalize_number(Decimal("1234.56"))
        assert result == "1234.56"

    def test_trailing_zero_is_canonicalised(self) -> None:
        result = normalize_number("1234.5")
        assert result == "1234.50"

    def test_no_decimal_part(self) -> None:
        result = normalize_number("1234")
        assert result == "1234.00"


class TestNormalizeDate:
    """``normalize_date``: every accepted input renders as ISO ``YYYY-MM-DD``."""

    EXPECTED = "2026-03-31"

    def test_dayfirst_dotted(self) -> None:
        assert normalize_date("31.03.2026") == self.EXPECTED

    def test_iso_date(self) -> None:
        assert normalize_date("2026-03-31") == self.EXPECTED

    def test_date_object(self) -> None:
        assert normalize_date(date(2026, 3, 31)) == self.EXPECTED

    def test_datetime_object(self) -> None:
        assert normalize_date(datetime(2026, 3, 31, 10, 30)) == self.EXPECTED

    def test_slash_variant(self) -> None:
        assert normalize_date("31/03/2026") == self.EXPECTED


class TestNormalizeIban:
    """``normalize_iban``: upper-case with all whitespace removed."""

    def test_uppercase_and_strip_whitespace(self) -> None:
        result = normalize_iban("de 89 3704 0044 0532 0130 00")
        assert result == "DE89370400440532013000"

    def test_already_normalised(self) -> None:
        iban = "CH9300762011623852957"
        assert normalize_iban(iban) == "CH9300762011623852957"

    def test_tabs_and_newlines(self) -> None:
        result = normalize_iban("ch 93\t0076\n2011623852957")
        assert result == "CH9300762011623852957"


class TestShouldSkipTextAgreement:
    """``should_skip_text_agreement``: None, Literal, short-string and small-number rules."""

    def test_short_string_skipped(self) -> None:
        skipped = should_skip_text_agreement("AB", str)
        assert skipped is True

    def test_long_string_not_skipped(self) -> None:
        skipped = should_skip_text_agreement("ABC", str)
        assert skipped is False

    def test_number_abs_lt_10_skipped(self) -> None:
        small_values = [
            (0, int),
            (9, int),
            (-9, int),
            (9.5, float),
            (Decimal("9.99"), Decimal),
        ]
        for value, field_type in small_values:
            assert should_skip_text_agreement(value, field_type) is True

    def test_number_abs_ge_10_not_skipped(self) -> None:
        large_values = [
            (10, int),
            (-10, int),
            (Decimal("1234.56"), Decimal),
        ]
        for value, field_type in large_values:
            assert should_skip_text_agreement(value, field_type) is False

    def test_literal_type_skipped(self) -> None:
        lit = Literal["checking", "credit", "savings"]
        skipped = should_skip_text_agreement("checking", lit)
        assert skipped is True

    def test_none_value_skipped(self) -> None:
        for field_type in (str, None):
            assert should_skip_text_agreement(None, field_type) is True

    def test_numeric_string_treated_as_string(self) -> None:
        # A numeric value passed as a string is judged by the short-string
        # rule rather than the numeric rule.
        skipped = should_skip_text_agreement("9", str)
        assert skipped is True