Compare commits

..

2 commits

Author SHA1 Message Date
2d22115893 feat(provenance): normalisers + short-value skip rule (spec §6) (#7)
Some checks are pending
tests / test (push) Waiting to run
Normalizer primitives land.
2026-04-18 08:56:45 +00:00
527fc620fe feat(provenance): normalisers + short-value skip rule (spec §6)
All checks were successful
tests / test (pull_request) Successful in 1m0s
tests / test (push) Successful in 1m28s
Pure functions the ReliabilityStep will compose to compare extracted values
against OCR snippets (and context.texts). Kept in one module so every rule
is directly unit-testable without pulling in the step ABC.

Highlights:

- `normalize_string`: NFKC + casefold + strip common punctuation (. , : ; !
  ? () [] {} / \\ ' " `) + collapse whitespace. Substring-compatible.

- `normalize_number`: returns the canonical "[-]DDD.DD" form (always 2dp)
  after stripping currency symbols. Heuristic separator detection handles
  Swiss-German apostrophes ("1'234.56"), de-DE commas ("1.234,56"), and
  plain ASCII ("1234.56" / "1234.5" / "1234"). Accepts native int/float/
  Decimal as well as str.

- `normalize_date`: dateutil parse with dayfirst=True → ISO YYYY-MM-DD.
  Date and datetime objects short-circuit to their isoformat().

- `normalize_iban`: uppercase + strip whitespace. Format validation is the
  call site's job; this is pure canonicalisation.

- `should_skip_text_agreement`: dispatches on type + value. Literal → skip,
  None → skip, numeric |v|<10 → skip, len(str) ≤ 2 → skip. Numeric check
  runs first so `10` (len("10")==2) is treated on the numeric side
  (not skipped) instead of tripping the string length rule.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-18 10:56:31 +02:00
3 changed files with 337 additions and 0 deletions

View file

@ -0,0 +1,32 @@
"""Provenance subsystem — normalisers, mapper, verifier.
Three pieces compose the reliability check:
* :mod:`ix.provenance.normalize` pure text/number/date/IBAN normalisers
used to compare OCR snippets to extracted values.
* :mod:`ix.provenance.mapper` resolves LLM-emitted segment IDs to
:class:`~ix.contracts.provenance.FieldProvenance` entries.
* :mod:`ix.provenance.verify` per-field-type dispatcher that writes the
``provenance_verified`` / ``text_agreement`` flags.
Only :mod:`normalize` is exported from the package at this step; the mapper
and verifier land in task 1.8.
"""
from __future__ import annotations
from ix.provenance.normalize import (
normalize_date,
normalize_iban,
normalize_number,
normalize_string,
should_skip_text_agreement,
)
# Public API of the package: exactly the five names imported from
# ``ix.provenance.normalize`` above, so ``from ix.provenance import *``
# exposes the normalisers and the skip rule and nothing else.
__all__ = [
"normalize_date",
"normalize_iban",
"normalize_number",
"normalize_string",
"should_skip_text_agreement",
]

View file

@ -0,0 +1,181 @@
"""Pure normalisers used by the reliability check (spec §6).
The ReliabilityStep compares extracted values against OCR segment snippets
(and raw ``context.texts``) after passing both sides through the same
normaliser. Keeping these functions pure (no IO, no state) means the
ReliabilityStep itself can stay a thin dispatcher and every rule is
directly unit-testable.
All normalisers return ``str`` so the downstream ``substring`` / ``equals``
comparison is trivial.
"""
from __future__ import annotations
import re
import unicodedata
from datetime import date, datetime
from decimal import Decimal, InvalidOperation
from typing import Any, get_origin
from dateutil import parser as _dateparser
# ---------------------------------------------------------------------------
# String
# ---------------------------------------------------------------------------
# Characters that are replaced by a single space before comparing extracted
# text with OCR text:  . , : ; ! ? ( ) [ ] { } / \ ' " `
_PUNCTUATION_RE = re.compile(r"[.,:;!?()\[\]{}/\\'\"`]")
# Any run of whitespace; collapsed to one space by ``normalize_string``.
_WHITESPACE_RE = re.compile(r"\s+")


def normalize_string(s: str) -> str:
    """Return a comparison-friendly form of *s*.

    Steps, in order: Unicode NFKC normalisation, ``casefold()``, every
    character matched by ``_PUNCTUATION_RE`` replaced by a space, runs of
    whitespace collapsed to a single space, and leading/trailing
    whitespace removed.
    """
    folded = unicodedata.normalize("NFKC", s).casefold()
    # Punctuation becomes a space (not the empty string) so that tokens
    # separated only by punctuation do not fuse together.
    without_punctuation = _PUNCTUATION_RE.sub(" ", folded)
    return _WHITESPACE_RE.sub(" ", without_punctuation).strip()
# ---------------------------------------------------------------------------
# Number
# ---------------------------------------------------------------------------
# Strip currency symbols / codes and everything that isn't a digit, sign,
# apostrophe, dot, or comma. The apostrophe/dot/comma handling is done in a
# second pass that figures out thousands-separator vs. decimal-separator from
# structure.
_NUMERIC_KEEP_RE = re.compile(r"[^0-9.,'\s\-+]")


def _parse_numeric_string(raw: str) -> Decimal:
    """Heuristically decode a localised number string into a ``Decimal``.

    Rules:

    * U+2212 MINUS SIGN is treated as an ASCII ``-`` so a typeset negative
      number keeps its sign.
    * Anything that isn't an ASCII digit, sign, dot, comma, apostrophe, or
      whitespace is dropped (this removes currency symbols / codes).
    * Apostrophes are always thousands separators (Swiss-German style).
    * Whitespace of any kind (space, tab, no-break space, narrow no-break
      space, ...) is always a thousands separator (fr-FR style).
    * If both ``.`` and ``,`` appear, the rightmost is the decimal separator
      and the other is the thousands separator.
    * If only ``,`` appears: it is the decimal separator when exactly two
      digits follow the last comma, otherwise a thousands separator.
    * If only ``.`` appears: several dots whose last group is not exactly
      two digits are thousands separators; a single dot is left alone
      (``1.5`` is ``1.5``).

    Raises:
        InvalidOperation: when nothing numeric is left after cleaning, or
            when ``Decimal`` rejects the cleaned text.
    """
    # The keep-filter below only knows the ASCII hyphen; without this
    # mapping "\u2212123.45" would silently parse as a positive number.
    text = raw.replace("\u2212", "-")
    cleaned = _NUMERIC_KEEP_RE.sub("", text).replace("'", "")
    # ``\s`` in the filter keeps *all* Unicode whitespace, so all of it has
    # to go here -- removing only " " leaves NBSP / tab separators behind
    # and makes Decimal() fail on e.g. "1\u00a0234,56".
    cleaned = "".join(cleaned.split())

    has_dot = "." in cleaned
    has_comma = "," in cleaned
    if has_dot and has_comma:
        if cleaned.rfind(".") > cleaned.rfind(","):
            # Dot is the decimal separator; commas group thousands.
            cleaned = cleaned.replace(",", "")
        else:
            # Comma is the decimal separator; dots group thousands.
            cleaned = cleaned.replace(".", "").replace(",", ".")
    elif has_comma:
        # Only commas: decimal if exactly two digits follow, else thousands.
        tail = cleaned.split(",")[-1]
        if len(tail) == 2 and tail.isdigit():
            cleaned = cleaned.replace(",", ".")
        else:
            cleaned = cleaned.replace(",", "")
    elif has_dot:
        # Only dots: several dots with a non-2-digit last group must be
        # thousands separators (e.g. "1.234.567"). A single dot stays.
        tail = cleaned.split(".")[-1]
        if (len(tail) != 2 or not tail.isdigit()) and cleaned.count(".") > 1:
            cleaned = cleaned.replace(".", "")

    if cleaned in ("", "+", "-"):
        raise InvalidOperation(f"cannot parse number: {raw!r}")
    return Decimal(cleaned)
def normalize_number(value: int | float | Decimal | str) -> str:
"""Return ``"[-]DDD.DD"`` canonical form — always 2 decimal places.
Accepts localized strings (``"CHF 1'234.56"``, ``"1.234,56 EUR"``,
``"-123.45"``) as well as native numeric types.
"""
if isinstance(value, Decimal):
dec = value
elif isinstance(value, (int, float)):
dec = Decimal(str(value))
else:
dec = _parse_numeric_string(value)
# Quantize to 2dp; keep sign.
quantized = dec.quantize(Decimal("0.01"))
return format(quantized, "f")
# ---------------------------------------------------------------------------
# Date
# ---------------------------------------------------------------------------
def normalize_date(value: date | datetime | str) -> str:
"""Parse via dateutil (dayfirst=True) and return ISO ``YYYY-MM-DD``."""
if isinstance(value, datetime):
return value.date().isoformat()
if isinstance(value, date):
return value.isoformat()
parsed = _dateparser.parse(value, dayfirst=True)
return parsed.date().isoformat()
# ---------------------------------------------------------------------------
# IBAN
# ---------------------------------------------------------------------------
def normalize_iban(s: str) -> str:
    """Canonicalise an IBAN: remove every whitespace character, then upper-case.

    Purely textual -- no checksum, length, or country validation happens
    here; that is left to the call site.
    """
    compact = "".join(ch for ch in s if not ch.isspace())
    return compact.upper()
# ---------------------------------------------------------------------------
# Short-value skip rule
# ---------------------------------------------------------------------------
def _is_literal_type(field_type: Any) -> bool:
    """Return True when *field_type* is a ``Literal[...]``, possibly wrapped.

    Recognised wrappers: ``Annotated[Literal[...], ...]`` and unions whose
    non-``None`` members are all literal types (``Optional[Literal[...]]``,
    ``Literal[...] | None``).
    """
    import types
    import typing

    origin = get_origin(field_type)
    if origin is typing.Literal:
        return True
    if origin is typing.Annotated:
        return _is_literal_type(typing.get_args(field_type)[0])
    if origin is typing.Union or origin is getattr(types, "UnionType", None):
        members = [
            arg for arg in typing.get_args(field_type) if arg is not type(None)
        ]
        # A union that also admits free-form types (e.g. ``Literal | str``)
        # is not treated as an enum.
        return bool(members) and all(_is_literal_type(arg) for arg in members)
    return False


def should_skip_text_agreement(value: Any, field_type: Any) -> bool:
    """Return True when ``text_agreement`` should be recorded as ``None``.

    Rules (spec §6 ReliabilityStep), evaluated in this order:

    1. ``value is None`` -> skip.
    2. ``field_type`` is a ``Literal[...]`` (also when wrapped in
       ``Optional`` / ``| None`` / ``Annotated``) -> skip (enum labels don't
       appear verbatim in the source text).
    3. Numeric value (int/float/Decimal, but not bool) -> skip exactly when
       ``|v| < 10``.
    4. Anything else: stringified value length <= 2 chars -> skip (short
       strings collide with random OCR noise).

    ``provenance_verified`` still runs for all of these; the bbox-anchored
    cite is stronger than a global text scan for short values.

    Args:
        value: the extracted field value.
        field_type: the field's declared type annotation (may be ``None``).

    Returns:
        ``True`` to skip the text-agreement check, ``False`` to run it.
    """
    if value is None:
        return True

    if _is_literal_type(field_type):
        return True

    # Numeric rule runs before the stringified-length rule so that 10
    # (len("10") == 2) is judged as a number and therefore NOT skipped.
    # bool is a subclass of int; it is excluded so it falls through to the
    # string rule ("True" / "False" are longer than 2 chars anyway).
    if isinstance(value, (int, float, Decimal)) and not isinstance(value, bool):
        try:
            return abs(Decimal(str(value))) < 10
        except InvalidOperation:
            # e.g. NaN: ordering comparisons on a Decimal NaN signal
            # InvalidOperation. Fall back to the string-length rule.
            pass

    # Stringified length rule (strings and anything not numeric).
    return len(str(value)) <= 2

View file

@ -0,0 +1,124 @@
"""Tests for the provenance normalisers (spec §6 ReliabilityStep)."""
from __future__ import annotations
from datetime import date, datetime
from decimal import Decimal
from typing import Literal
from ix.provenance.normalize import (
normalize_date,
normalize_iban,
normalize_number,
normalize_string,
should_skip_text_agreement,
)
class TestNormalizeString:
    """Behavioural checks for ``normalize_string``."""

    def test_uppercase_casefolded_and_punctuation_stripped(self) -> None:
        result = normalize_string(" FOO bar!!! ")
        assert result == "foo bar"

    def test_nfkc_applied_for_fullwidth(self) -> None:
        # "UBS" spelled with fullwidth capitals (U+FF00 block); NFKC maps
        # them to their ASCII counterparts before casefolding.
        fullwidth_ubs = "\uff35\uff22\uff33"
        result = normalize_string(f"{fullwidth_ubs} AG")
        assert result == "ubs ag"

    def test_whitespace_collapse(self) -> None:
        result = normalize_string("UBS Switzerland\tAG")
        assert result == "ubs switzerland ag"

    def test_strips_common_punctuation(self) -> None:
        # Covers comma, dot, parentheses, semicolon, colon and slash.
        raw = "Hello, World. (foo); bar: baz / qux"
        expected = "hello world foo bar baz qux"
        assert normalize_string(raw) == expected

    def test_empty_string(self) -> None:
        result = normalize_string("")
        assert result == ""
class TestNormalizeNumber:
    """Behavioural checks for ``normalize_number``."""

    def test_chf_swiss_apostrophe_thousands(self) -> None:
        result = normalize_number("CHF 1'234.56")
        assert result == "1234.56"

    def test_de_de_dot_thousands_and_comma_decimal(self) -> None:
        result = normalize_number("1.234,56 EUR")
        assert result == "1234.56"

    def test_negative_sign(self) -> None:
        # Sign must survive both the plain and the currency-prefixed form.
        cases = (
            ("-123.45", "-123.45"),
            ("CHF -1'234.56", "-1234.56"),
        )
        for raw, expected in cases:
            assert normalize_number(raw) == expected

    def test_int_input(self) -> None:
        result = normalize_number(42)
        assert result == "42.00"

    def test_float_input(self) -> None:
        result = normalize_number(1234.5)
        assert result == "1234.50"

    def test_decimal_input(self) -> None:
        result = normalize_number(Decimal("1234.56"))
        assert result == "1234.56"

    def test_trailing_zero_is_canonicalised(self) -> None:
        result = normalize_number("1234.5")
        assert result == "1234.50"

    def test_no_decimal_part(self) -> None:
        result = normalize_number("1234")
        assert result == "1234.00"
class TestNormalizeDate:
    """Behavioural checks for ``normalize_date``."""

    # Every case in this class describes the same calendar day.
    def test_dayfirst_dotted(self) -> None:
        result = normalize_date("31.03.2026")
        assert result == "2026-03-31"

    def test_iso_date(self) -> None:
        result = normalize_date("2026-03-31")
        assert result == "2026-03-31"

    def test_date_object(self) -> None:
        day = date(2026, 3, 31)
        assert normalize_date(day) == "2026-03-31"

    def test_datetime_object(self) -> None:
        # The time-of-day component is discarded.
        moment = datetime(2026, 3, 31, 10, 30)
        assert normalize_date(moment) == "2026-03-31"

    def test_slash_variant(self) -> None:
        result = normalize_date("31/03/2026")
        assert result == "2026-03-31"
class TestNormalizeIban:
    """Behavioural checks for ``normalize_iban``."""

    def test_uppercase_and_strip_whitespace(self) -> None:
        result = normalize_iban("de 89 3704 0044 0532 0130 00")
        assert result == "DE89370400440532013000"

    def test_already_normalised(self) -> None:
        # Canonical input must come back unchanged.
        result = normalize_iban("CH9300762011623852957")
        assert result == "CH9300762011623852957"

    def test_tabs_and_newlines(self) -> None:
        result = normalize_iban("ch 93\t0076\n2011623852957")
        assert result == "CH9300762011623852957"
class TestShouldSkipTextAgreement:
    """Behavioural checks for ``should_skip_text_agreement``."""

    def test_short_string_skipped(self) -> None:
        skipped = should_skip_text_agreement("AB", str)
        assert skipped is True

    def test_long_string_not_skipped(self) -> None:
        skipped = should_skip_text_agreement("ABC", str)
        assert skipped is False

    def test_number_abs_lt_10_skipped(self) -> None:
        small_values = (
            (0, int),
            (9, int),
            (-9, int),
            (9.5, float),
            (Decimal("9.99"), Decimal),
        )
        for value, field_type in small_values:
            assert should_skip_text_agreement(value, field_type) is True

    def test_number_abs_ge_10_not_skipped(self) -> None:
        large_values = (
            (10, int),
            (-10, int),
            (Decimal("1234.56"), Decimal),
        )
        for value, field_type in large_values:
            assert should_skip_text_agreement(value, field_type) is False

    def test_literal_type_skipped(self) -> None:
        lit = Literal["checking", "credit", "savings"]
        skipped = should_skip_text_agreement("checking", lit)
        assert skipped is True

    def test_none_value_skipped(self) -> None:
        # None is skipped regardless of the declared field type.
        for field_type in (str, None):
            assert should_skip_text_agreement(None, field_type) is True

    def test_numeric_string_treated_as_string(self) -> None:
        # A numeric-looking *string* is judged by the length rule, not the
        # numeric rule, and is still short enough to be skipped.
        skipped = should_skip_text_agreement("9", str)
        assert skipped is True