Compare commits
2 commits
b2ff27c1ca
...
2d22115893
| Author | SHA1 | Date | |
|---|---|---|---|
| 2d22115893 | | | |
| 527fc620fe | | | |
3 changed files with 337 additions and 0 deletions
32
src/ix/provenance/__init__.py
Normal file
32
src/ix/provenance/__init__.py
Normal file
|
|
@ -0,0 +1,32 @@
|
|||
"""Provenance subsystem — normalisers, mapper, verifier.
|
||||
|
||||
Three pieces compose the reliability check:
|
||||
|
||||
* :mod:`ix.provenance.normalize` — pure text/number/date/IBAN normalisers
|
||||
used to compare OCR snippets to extracted values.
|
||||
* :mod:`ix.provenance.mapper` — resolves LLM-emitted segment IDs to
|
||||
:class:`~ix.contracts.provenance.FieldProvenance` entries.
|
||||
* :mod:`ix.provenance.verify` — per-field-type dispatcher that writes the
|
||||
``provenance_verified`` / ``text_agreement`` flags.
|
||||
|
||||
Only :mod:`normalize` is exported from the package at this step; the mapper
|
||||
and verifier land in task 1.8.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from ix.provenance.normalize import (
|
||||
normalize_date,
|
||||
normalize_iban,
|
||||
normalize_number,
|
||||
normalize_string,
|
||||
should_skip_text_agreement,
|
||||
)
|
||||
|
||||
# Public names of the package: exactly the helpers imported above from
# ``ix.provenance.normalize``.
__all__ = [
    "normalize_date",
    "normalize_iban",
    "normalize_number",
    "normalize_string",
    "should_skip_text_agreement",
]
|
||||
181
src/ix/provenance/normalize.py
Normal file
181
src/ix/provenance/normalize.py
Normal file
|
|
@ -0,0 +1,181 @@
|
|||
"""Pure normalisers used by the reliability check (spec §6).
|
||||
|
||||
The ReliabilityStep compares extracted values against OCR segment snippets
|
||||
(and raw ``context.texts``) after passing both sides through the same
|
||||
normaliser. Keeping these functions pure (no IO, no state) means the
|
||||
ReliabilityStep itself can stay a thin dispatcher and every rule is
|
||||
directly unit-testable.
|
||||
|
||||
All normalisers return ``str`` so the downstream ``substring`` / ``equals``
|
||||
comparison is trivial.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
import unicodedata
|
||||
from datetime import date, datetime
|
||||
from decimal import Decimal, InvalidOperation
|
||||
from typing import Any, get_origin
|
||||
|
||||
from dateutil import parser as _dateparser
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# String
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Punctuation replaced by a space before comparing extracted values with OCR
# text: dot, comma, colon, semicolon, exclamation and question marks,
# parentheses, square brackets, braces, slash, backslash, single and double
# quotes, and the backtick.
_PUNCTUATION_RE = re.compile(r"[.,:;!?()\[\]{}/\\'\"`]")
# Any run of whitespace; collapsed to a single space.
_WHITESPACE_RE = re.compile(r"\s+")


def normalize_string(s: str) -> str:
    """Return ``s`` in the canonical form used for text comparison.

    The steps, in order: Unicode NFKC normalisation, ``casefold()``, every
    punctuation character matched by ``_PUNCTUATION_RE`` replaced by a
    space, whitespace runs collapsed to one space, and leading/trailing
    whitespace removed.
    """
    folded = unicodedata.normalize("NFKC", s).casefold()
    # Punctuation becomes a space (not ""), so "foo/bar" stays two words.
    spaced = _PUNCTUATION_RE.sub(" ", folded)
    return _WHITESPACE_RE.sub(" ", spaced).strip()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Number
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Strip currency symbols / codes and everything that isn't a digit, sign,
|
||||
# apostrophe, dot, or comma. The apostrophe/dot/comma handling is done in a
|
||||
# second pass that figures out thousands-separator vs. decimal-separator from
|
||||
# structure.
|
||||
_NUMERIC_KEEP_RE = re.compile(r"[^0-9.,'\s\-+]")
|
||||
|
||||
|
||||
def _parse_numeric_string(raw: str) -> Decimal:
|
||||
"""Heuristically decode localised numbers.
|
||||
|
||||
Rules:
|
||||
|
||||
* Strip anything that isn't a digit, sign, dot, comma, apostrophe, or
|
||||
whitespace (this drops currency symbols / codes).
|
||||
* Apostrophes are always thousands separators (Swiss-German style).
|
||||
* Whitespace is always a thousands separator (fr-FR style).
|
||||
* If both ``.`` and ``,`` appear, the rightmost is the decimal separator
|
||||
and the other is the thousands separator.
|
||||
* If only one of them appears: assume it's the decimal separator when it
|
||||
has exactly 2 trailing digits, otherwise a thousands separator.
|
||||
"""
|
||||
cleaned = _NUMERIC_KEEP_RE.sub("", raw).strip()
|
||||
cleaned = cleaned.replace("'", "").replace(" ", "")
|
||||
|
||||
has_dot = "." in cleaned
|
||||
has_comma = "," in cleaned
|
||||
|
||||
if has_dot and has_comma:
|
||||
if cleaned.rfind(".") > cleaned.rfind(","):
|
||||
# dot is decimal
|
||||
cleaned = cleaned.replace(",", "")
|
||||
else:
|
||||
# comma is decimal
|
||||
cleaned = cleaned.replace(".", "").replace(",", ".")
|
||||
elif has_comma:
|
||||
# Only comma — treat as decimal if 2 digits follow, else thousands.
|
||||
tail = cleaned.split(",")[-1]
|
||||
if len(tail) == 2 and tail.isdigit():
|
||||
cleaned = cleaned.replace(",", ".")
|
||||
else:
|
||||
cleaned = cleaned.replace(",", "")
|
||||
elif has_dot:
|
||||
# Only dot — same heuristic in reverse. If multiple dots appear they
|
||||
# must be thousands separators (e.g. "1.234.567"); strip them. A
|
||||
# single dot with a non-2-digit tail stays as-is (1.5 is 1.5).
|
||||
tail = cleaned.split(".")[-1]
|
||||
if (len(tail) != 2 or not tail.isdigit()) and cleaned.count(".") > 1:
|
||||
cleaned = cleaned.replace(".", "")
|
||||
if cleaned in ("", "+", "-"):
|
||||
raise InvalidOperation(f"cannot parse number: {raw!r}")
|
||||
return Decimal(cleaned)
|
||||
|
||||
|
||||
def normalize_number(value: int | float | Decimal | str) -> str:
|
||||
"""Return ``"[-]DDD.DD"`` canonical form — always 2 decimal places.
|
||||
|
||||
Accepts localized strings (``"CHF 1'234.56"``, ``"1.234,56 EUR"``,
|
||||
``"-123.45"``) as well as native numeric types.
|
||||
"""
|
||||
if isinstance(value, Decimal):
|
||||
dec = value
|
||||
elif isinstance(value, (int, float)):
|
||||
dec = Decimal(str(value))
|
||||
else:
|
||||
dec = _parse_numeric_string(value)
|
||||
# Quantize to 2dp; keep sign.
|
||||
quantized = dec.quantize(Decimal("0.01"))
|
||||
return format(quantized, "f")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Date
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def normalize_date(value: date | datetime | str) -> str:
|
||||
"""Parse via dateutil (dayfirst=True) and return ISO ``YYYY-MM-DD``."""
|
||||
if isinstance(value, datetime):
|
||||
return value.date().isoformat()
|
||||
if isinstance(value, date):
|
||||
return value.isoformat()
|
||||
parsed = _dateparser.parse(value, dayfirst=True)
|
||||
return parsed.date().isoformat()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# IBAN
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def normalize_iban(s: str) -> str:
    """Return ``s`` upper-cased with every whitespace character removed.

    The result is not checked against any IBAN format; validation is left
    to the caller.
    """
    compact = "".join(ch for ch in s if not ch.isspace())
    return compact.upper()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Short-value skip rule
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _is_literal_type(field_type: Any) -> bool:
    """Return True when ``field_type`` is a ``Literal[...]``, possibly wrapped.

    Besides a bare ``Literal[...]`` this recognises ``Annotated[Literal[...],
    ...]`` and unions (``Optional[...]`` / ``X | None``) whose members, apart
    from ``None``, are all literal types.
    """
    import types
    import typing

    origin = typing.get_origin(field_type)
    if origin is None:
        # Plain classes, None, and anything that is not a generic alias.
        return False
    if origin is typing.Literal:
        return True
    if origin is typing.Annotated:
        # The first argument of Annotated is the underlying type.
        return _is_literal_type(typing.get_args(field_type)[0])
    # types.UnionType (the ``X | Y`` form) does not exist on older Pythons.
    if origin is typing.Union or origin is getattr(types, "UnionType", None):
        members = [
            arg for arg in typing.get_args(field_type) if arg is not type(None)
        ]
        return bool(members) and all(_is_literal_type(arg) for arg in members)
    return False


def should_skip_text_agreement(value: Any, field_type: Any) -> bool:
    """Return True when ``text_agreement`` should be recorded as ``None``.

    Rules (spec §6 ReliabilityStep):

    1. ``value is None`` → skip.
    2. ``field_type`` is a ``Literal[...]`` — bare, or wrapped in
       ``Optional`` / ``| None`` / ``Annotated`` → skip (enum labels don't
       appear verbatim in the source text).
    3. Numeric value (int/float/Decimal, but not bool) → skip exactly when
       ``|v| < 10``. This is decided before rule 4, so ``10`` is not skipped
       even though ``"10"`` has only two characters.
    4. Otherwise, stringified value length ≤ 2 chars → skip (short strings
       collide with random OCR noise).

    ``provenance_verified`` still runs for all of these — the bbox-anchored
    cite is stronger than a global text scan for short values.
    """
    if value is None:
        return True

    if _is_literal_type(field_type):
        return True

    # bool is a subclass of int; keep it out of the numeric rule so it is
    # judged by its text ("True" / "False") instead.
    is_number = isinstance(value, (int, float, Decimal)) and not isinstance(
        value, bool
    )
    if is_number:
        try:
            return abs(Decimal(str(value))) < 10
        except InvalidOperation:
            # Not comparable as a Decimal; fall back to the length rule.
            pass

    return len(str(value)) <= 2
|
||||
124
tests/unit/test_provenance_normalize.py
Normal file
124
tests/unit/test_provenance_normalize.py
Normal file
|
|
@ -0,0 +1,124 @@
|
|||
"""Tests for the provenance normalisers (spec §6 ReliabilityStep)."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import date, datetime
|
||||
from decimal import Decimal
|
||||
from typing import Literal
|
||||
|
||||
from ix.provenance.normalize import (
|
||||
normalize_date,
|
||||
normalize_iban,
|
||||
normalize_number,
|
||||
normalize_string,
|
||||
should_skip_text_agreement,
|
||||
)
|
||||
|
||||
|
||||
class TestNormalizeString:
    """Behaviour of ``normalize_string`` on representative inputs."""

    def test_uppercase_casefolded_and_punctuation_stripped(self) -> None:
        result = normalize_string(" FOO bar!!! ")
        assert result == "foo bar"

    def test_nfkc_applied_for_fullwidth(self) -> None:
        # "UBS" written with fullwidth capitals (U+FF00 block); NFKC maps
        # them to their ASCII counterparts.
        fullwidth_ubs = "\uff35\uff22\uff33"
        result = normalize_string(f"{fullwidth_ubs} AG")
        assert result == "ubs ag"

    def test_whitespace_collapse(self) -> None:
        result = normalize_string("UBS Switzerland\tAG")
        assert result == "ubs switzerland ag"

    def test_strips_common_punctuation(self) -> None:
        # Covers colon, comma, dot, semicolon, parentheses and slash.
        expected = "hello world foo bar baz qux"
        result = normalize_string("Hello, World. (foo); bar: baz / qux")
        assert result == expected

    def test_empty_string(self) -> None:
        result = normalize_string("")
        assert result == ""
|
||||
|
||||
|
||||
class TestNormalizeNumber:
    """Behaviour of ``normalize_number`` for strings and native numbers."""

    def test_chf_swiss_apostrophe_thousands(self) -> None:
        result = normalize_number("CHF 1'234.56")
        assert result == "1234.56"

    def test_de_de_dot_thousands_and_comma_decimal(self) -> None:
        result = normalize_number("1.234,56 EUR")
        assert result == "1234.56"

    def test_negative_sign(self) -> None:
        cases = (
            ("-123.45", "-123.45"),
            ("CHF -1'234.56", "-1234.56"),
        )
        for raw, expected in cases:
            assert normalize_number(raw) == expected

    def test_int_input(self) -> None:
        result = normalize_number(42)
        assert result == "42.00"

    def test_float_input(self) -> None:
        result = normalize_number(1234.5)
        assert result == "1234.50"

    def test_decimal_input(self) -> None:
        result = normalize_number(Decimal("1234.56"))
        assert result == "1234.56"

    def test_trailing_zero_is_canonicalised(self) -> None:
        result = normalize_number("1234.5")
        assert result == "1234.50"

    def test_no_decimal_part(self) -> None:
        result = normalize_number("1234")
        assert result == "1234.00"
|
||||
|
||||
|
||||
class TestNormalizeDate:
    """Behaviour of ``normalize_date`` for strings and date objects."""

    # Every case in this class resolves to the same calendar day.
    EXPECTED = "2026-03-31"

    def test_dayfirst_dotted(self) -> None:
        result = normalize_date("31.03.2026")
        assert result == self.EXPECTED

    def test_iso_date(self) -> None:
        result = normalize_date("2026-03-31")
        assert result == self.EXPECTED

    def test_date_object(self) -> None:
        result = normalize_date(date(2026, 3, 31))
        assert result == self.EXPECTED

    def test_datetime_object(self) -> None:
        result = normalize_date(datetime(2026, 3, 31, 10, 30))
        assert result == self.EXPECTED

    def test_slash_variant(self) -> None:
        result = normalize_date("31/03/2026")
        assert result == self.EXPECTED
|
||||
|
||||
|
||||
class TestNormalizeIban:
    """Behaviour of ``normalize_iban``: upper-casing and whitespace removal."""

    def test_uppercase_and_strip_whitespace(self) -> None:
        result = normalize_iban("de 89 3704 0044 0532 0130 00")
        assert result == "DE89370400440532013000"

    def test_already_normalised(self) -> None:
        iban = "CH9300762011623852957"
        assert normalize_iban(iban) == iban

    def test_tabs_and_newlines(self) -> None:
        result = normalize_iban("ch 93\t0076\n2011623852957")
        assert result == "CH9300762011623852957"
|
||||
|
||||
|
||||
class TestShouldSkipTextAgreement:
    """Behaviour of ``should_skip_text_agreement`` for each skip rule."""

    def test_short_string_skipped(self) -> None:
        skipped = should_skip_text_agreement("AB", str)
        assert skipped is True

    def test_long_string_not_skipped(self) -> None:
        skipped = should_skip_text_agreement("ABC", str)
        assert skipped is False

    def test_number_abs_lt_10_skipped(self) -> None:
        cases = (
            (0, int),
            (9, int),
            (-9, int),
            (9.5, float),
            (Decimal("9.99"), Decimal),
        )
        for value, field_type in cases:
            assert should_skip_text_agreement(value, field_type) is True

    def test_number_abs_ge_10_not_skipped(self) -> None:
        cases = (
            (10, int),
            (-10, int),
            (Decimal("1234.56"), Decimal),
        )
        for value, field_type in cases:
            assert should_skip_text_agreement(value, field_type) is False

    def test_literal_type_skipped(self) -> None:
        account_kind = Literal["checking", "credit", "savings"]
        skipped = should_skip_text_agreement("checking", account_kind)
        assert skipped is True

    def test_none_value_skipped(self) -> None:
        for field_type in (str, None):
            assert should_skip_text_agreement(None, field_type) is True

    def test_numeric_string_treated_as_string(self) -> None:
        # A numeric value passed as a string is judged by its length, so a
        # one-character "9" is still skipped.
        skipped = should_skip_text_agreement("9", str)
        assert skipped is True
|
||||
Loading…
Reference in a new issue