Compare commits
No commits in common. "2d2211589387990801bf9190c64c592731b03ca3" and "b2ff27c1ca35674350ce5051eae50bc322ec0dbb" have entirely different histories.
2d22115893
...
b2ff27c1ca
3 changed files with 0 additions and 337 deletions
|
|
@ -1,32 +0,0 @@
|
||||||
"""Provenance subsystem — normalisers, mapper, verifier.
|
|
||||||
|
|
||||||
Three pieces compose the reliability check:
|
|
||||||
|
|
||||||
* :mod:`ix.provenance.normalize` — pure text/number/date/IBAN normalisers
|
|
||||||
used to compare OCR snippets to extracted values.
|
|
||||||
* :mod:`ix.provenance.mapper` — resolves LLM-emitted segment IDs to
|
|
||||||
:class:`~ix.contracts.provenance.FieldProvenance` entries.
|
|
||||||
* :mod:`ix.provenance.verify` — per-field-type dispatcher that writes the
|
|
||||||
``provenance_verified`` / ``text_agreement`` flags.
|
|
||||||
|
|
||||||
Only :mod:`normalize` is exported from the package at this step; the mapper
|
|
||||||
and verifier land in task 1.8.
|
|
||||||
"""
|
|
||||||
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
from ix.provenance.normalize import (
|
|
||||||
normalize_date,
|
|
||||||
normalize_iban,
|
|
||||||
normalize_number,
|
|
||||||
normalize_string,
|
|
||||||
should_skip_text_agreement,
|
|
||||||
)
|
|
||||||
|
|
||||||
__all__ = [
|
|
||||||
"normalize_date",
|
|
||||||
"normalize_iban",
|
|
||||||
"normalize_number",
|
|
||||||
"normalize_string",
|
|
||||||
"should_skip_text_agreement",
|
|
||||||
]
|
|
||||||
|
|
@ -1,181 +0,0 @@
|
||||||
"""Pure normalisers used by the reliability check (spec §6).
|
|
||||||
|
|
||||||
The ReliabilityStep compares extracted values against OCR segment snippets
|
|
||||||
(and raw ``context.texts``) after passing both sides through the same
|
|
||||||
normaliser. Keeping these functions pure (no IO, no state) means the
|
|
||||||
ReliabilityStep itself can stay a thin dispatcher and every rule is
|
|
||||||
directly unit-testable.
|
|
||||||
|
|
||||||
All normalisers return ``str`` so the downstream ``substring`` / ``equals``
|
|
||||||
comparison is trivial.
|
|
||||||
"""
|
|
||||||
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import re
|
|
||||||
import unicodedata
|
|
||||||
from datetime import date, datetime
|
|
||||||
from decimal import Decimal, InvalidOperation
|
|
||||||
from typing import Any, get_origin
|
|
||||||
|
|
||||||
from dateutil import parser as _dateparser
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# String
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
# Punctuation that rarely carries meaning when an extracted value is compared
# with an OCR snippet: dot, comma, colon, semicolon, exclamation and question
# marks, parentheses, square brackets, braces, forward slash, backslash,
# single/double quotes and backtick.
_PUNCTUATION_RE = re.compile(r"[.,:;!?()\[\]{}/\\'\"`]")
_WHITESPACE_RE = re.compile(r"\s+")


def normalize_string(s: str) -> str:
    """Canonicalise free text so two spellings of the same value compare equal.

    Steps, in order: Unicode NFKC normalisation, case folding, replacement of
    every character matched by ``_PUNCTUATION_RE`` with a space, then
    collapsing each whitespace run to one space and trimming both ends.
    """
    folded = unicodedata.normalize("NFKC", s).casefold()
    # Punctuation becomes a space (not an empty string) so "a/b" stays two
    # tokens instead of fusing into "ab".
    without_punctuation = _PUNCTUATION_RE.sub(" ", folded)
    return _WHITESPACE_RE.sub(" ", without_punctuation).strip()
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Number
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
# Strip currency symbols / codes and everything that isn't a digit, sign,
|
|
||||||
# apostrophe, dot, or comma. The apostrophe/dot/comma handling is done in a
|
|
||||||
# second pass that figures out thousands-separator vs. decimal-separator from
|
|
||||||
# structure.
|
|
||||||
_NUMERIC_KEEP_RE = re.compile(r"[^0-9.,'\s\-+]")
|
|
||||||
|
|
||||||
|
|
||||||
def _parse_numeric_string(raw: str) -> Decimal:
|
|
||||||
"""Heuristically decode localised numbers.
|
|
||||||
|
|
||||||
Rules:
|
|
||||||
|
|
||||||
* Strip anything that isn't a digit, sign, dot, comma, apostrophe, or
|
|
||||||
whitespace (this drops currency symbols / codes).
|
|
||||||
* Apostrophes are always thousands separators (Swiss-German style).
|
|
||||||
* Whitespace is always a thousands separator (fr-FR style).
|
|
||||||
* If both ``.`` and ``,`` appear, the rightmost is the decimal separator
|
|
||||||
and the other is the thousands separator.
|
|
||||||
* If only one of them appears: assume it's the decimal separator when it
|
|
||||||
has exactly 2 trailing digits, otherwise a thousands separator.
|
|
||||||
"""
|
|
||||||
cleaned = _NUMERIC_KEEP_RE.sub("", raw).strip()
|
|
||||||
cleaned = cleaned.replace("'", "").replace(" ", "")
|
|
||||||
|
|
||||||
has_dot = "." in cleaned
|
|
||||||
has_comma = "," in cleaned
|
|
||||||
|
|
||||||
if has_dot and has_comma:
|
|
||||||
if cleaned.rfind(".") > cleaned.rfind(","):
|
|
||||||
# dot is decimal
|
|
||||||
cleaned = cleaned.replace(",", "")
|
|
||||||
else:
|
|
||||||
# comma is decimal
|
|
||||||
cleaned = cleaned.replace(".", "").replace(",", ".")
|
|
||||||
elif has_comma:
|
|
||||||
# Only comma — treat as decimal if 2 digits follow, else thousands.
|
|
||||||
tail = cleaned.split(",")[-1]
|
|
||||||
if len(tail) == 2 and tail.isdigit():
|
|
||||||
cleaned = cleaned.replace(",", ".")
|
|
||||||
else:
|
|
||||||
cleaned = cleaned.replace(",", "")
|
|
||||||
elif has_dot:
|
|
||||||
# Only dot — same heuristic in reverse. If multiple dots appear they
|
|
||||||
# must be thousands separators (e.g. "1.234.567"); strip them. A
|
|
||||||
# single dot with a non-2-digit tail stays as-is (1.5 is 1.5).
|
|
||||||
tail = cleaned.split(".")[-1]
|
|
||||||
if (len(tail) != 2 or not tail.isdigit()) and cleaned.count(".") > 1:
|
|
||||||
cleaned = cleaned.replace(".", "")
|
|
||||||
if cleaned in ("", "+", "-"):
|
|
||||||
raise InvalidOperation(f"cannot parse number: {raw!r}")
|
|
||||||
return Decimal(cleaned)
|
|
||||||
|
|
||||||
|
|
||||||
def normalize_number(value: int | float | Decimal | str) -> str:
|
|
||||||
"""Return ``"[-]DDD.DD"`` canonical form — always 2 decimal places.
|
|
||||||
|
|
||||||
Accepts localized strings (``"CHF 1'234.56"``, ``"1.234,56 EUR"``,
|
|
||||||
``"-123.45"``) as well as native numeric types.
|
|
||||||
"""
|
|
||||||
if isinstance(value, Decimal):
|
|
||||||
dec = value
|
|
||||||
elif isinstance(value, (int, float)):
|
|
||||||
dec = Decimal(str(value))
|
|
||||||
else:
|
|
||||||
dec = _parse_numeric_string(value)
|
|
||||||
# Quantize to 2dp; keep sign.
|
|
||||||
quantized = dec.quantize(Decimal("0.01"))
|
|
||||||
return format(quantized, "f")
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Date
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
def normalize_date(value: date | datetime | str) -> str:
|
|
||||||
"""Parse via dateutil (dayfirst=True) and return ISO ``YYYY-MM-DD``."""
|
|
||||||
if isinstance(value, datetime):
|
|
||||||
return value.date().isoformat()
|
|
||||||
if isinstance(value, date):
|
|
||||||
return value.isoformat()
|
|
||||||
parsed = _dateparser.parse(value, dayfirst=True)
|
|
||||||
return parsed.date().isoformat()
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# IBAN
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
def normalize_iban(s: str) -> str:
    """Return *s* upper-cased with every whitespace character removed.

    No format validation happens here; that is left to the call site.
    """
    compact = "".join(ch for ch in s if not ch.isspace())
    return compact.upper()
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Short-value skip rule
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
def _is_literal_type(field_type: Any) -> bool:
    """Return True for ``Literal[...]`` and for optional/annotated wrappers of it."""
    import types
    import typing

    origin = get_origin(field_type)
    if origin is typing.Literal:
        return True
    if origin is typing.Annotated:
        # The first argument of Annotated is the underlying type.
        return _is_literal_type(typing.get_args(field_type)[0])
    if origin is typing.Union or origin is getattr(types, "UnionType", None):
        # Optional[Literal[...]] / Literal[...] | None: ignore the None arm
        # and require every remaining arm to be a Literal.
        members = [arg for arg in typing.get_args(field_type) if arg is not type(None)]
        return bool(members) and all(_is_literal_type(member) for member in members)
    return False


def should_skip_text_agreement(value: Any, field_type: Any) -> bool:
    """Return True when ``text_agreement`` should be recorded as ``None``.

    Rules (spec §6 ReliabilityStep):

    1. ``value is None`` → skip.
    2. ``field_type`` is a ``Literal[...]`` -- bare, wrapped in
       ``Optional`` / ``| None``, or inside ``Annotated`` → skip (enum labels
       don't appear verbatim in the source text).
    3. Stringified value length ≤ 2 chars → skip (short strings collide with
       random OCR noise).
    4. Numeric value (int/float/Decimal) with ``|v| < 10`` → skip.

    ``provenance_verified`` still runs for all of these — the bbox-anchored
    cite is stronger than a global text scan for short values.
    """
    if value is None:
        return True

    if _is_literal_type(field_type):
        return True

    # Numeric short-value rule — checked before the stringified-length rule
    # so that 10 (whose string form has length 2) is judged by magnitude.
    # bool is a subclass of int; it is excluded here and falls through to
    # the string rule ("True" / "False" are longer than 2 characters).
    if not isinstance(value, bool) and isinstance(value, (int, float, Decimal)):
        try:
            return abs(Decimal(str(value))) < 10
        except InvalidOperation:
            # Ordering comparisons involving NaN raise InvalidOperation;
            # such values are judged by the string rule below instead.
            pass

    # Stringified length rule (strings and anything not handled above).
    return len(str(value)) <= 2
|
|
||||||
|
|
@ -1,124 +0,0 @@
|
||||||
"""Tests for the provenance normalisers (spec §6 ReliabilityStep)."""
|
|
||||||
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
from datetime import date, datetime
|
|
||||||
from decimal import Decimal
|
|
||||||
from typing import Literal
|
|
||||||
|
|
||||||
from ix.provenance.normalize import (
|
|
||||||
normalize_date,
|
|
||||||
normalize_iban,
|
|
||||||
normalize_number,
|
|
||||||
normalize_string,
|
|
||||||
should_skip_text_agreement,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class TestNormalizeString:
    """Checks for ``normalize_string``: folding, punctuation and whitespace."""

    def test_uppercase_casefolded_and_punctuation_stripped(self) -> None:
        raw = " FOO bar!!! "
        assert normalize_string(raw) == "foo bar"

    def test_nfkc_applied_for_fullwidth(self) -> None:
        # NFKC maps the fullwidth letters (U+FF00 block) to their ASCII
        # counterparts; case folding then lower-cases them.
        fullwidth_ubs = "\uff35\uff22\uff33"  # "UBS" written in fullwidth forms
        result = normalize_string(f"{fullwidth_ubs} AG")
        assert result == "ubs ag"

    def test_whitespace_collapse(self) -> None:
        raw = "UBS Switzerland\tAG"
        assert normalize_string(raw) == "ubs switzerland ag"

    def test_strips_common_punctuation(self) -> None:
        # Comma, dot, parentheses, semicolon, colon and slash all disappear.
        raw = "Hello, World. (foo); bar: baz / qux"
        expected = "hello world foo bar baz qux"
        assert normalize_string(raw) == expected

    def test_empty_string(self) -> None:
        result = normalize_string("")
        assert result == ""
|
|
||||||
|
|
||||||
|
|
||||||
class TestNormalizeNumber:
    """Checks for ``normalize_number`` across locales and native types."""

    def test_chf_swiss_apostrophe_thousands(self) -> None:
        raw = "CHF 1'234.56"
        assert normalize_number(raw) == "1234.56"

    def test_de_de_dot_thousands_and_comma_decimal(self) -> None:
        raw = "1.234,56 EUR"
        assert normalize_number(raw) == "1234.56"

    def test_negative_sign(self) -> None:
        cases = (
            ("-123.45", "-123.45"),
            ("CHF -1'234.56", "-1234.56"),
        )
        for raw, expected in cases:
            assert normalize_number(raw) == expected

    def test_int_input(self) -> None:
        result = normalize_number(42)
        assert result == "42.00"

    def test_float_input(self) -> None:
        result = normalize_number(1234.5)
        assert result == "1234.50"

    def test_decimal_input(self) -> None:
        amount = Decimal("1234.56")
        assert normalize_number(amount) == "1234.56"

    def test_trailing_zero_is_canonicalised(self) -> None:
        # One fractional digit is padded to two.
        result = normalize_number("1234.5")
        assert result == "1234.50"

    def test_no_decimal_part(self) -> None:
        result = normalize_number("1234")
        assert result == "1234.00"
|
|
||||||
|
|
||||||
|
|
||||||
class TestNormalizeDate:
    """Checks for ``normalize_date``: every input ends up as ``YYYY-MM-DD``."""

    def test_dayfirst_dotted(self) -> None:
        result = normalize_date("31.03.2026")
        assert result == "2026-03-31"

    def test_iso_date(self) -> None:
        iso = "2026-03-31"
        assert normalize_date(iso) == "2026-03-31"

    def test_date_object(self) -> None:
        day = date(2026, 3, 31)
        assert normalize_date(day) == "2026-03-31"

    def test_datetime_object(self) -> None:
        # The time-of-day part is discarded.
        moment = datetime(2026, 3, 31, 10, 30)
        assert normalize_date(moment) == "2026-03-31"

    def test_slash_variant(self) -> None:
        result = normalize_date("31/03/2026")
        assert result == "2026-03-31"
|
|
||||||
|
|
||||||
|
|
||||||
class TestNormalizeIban:
    """Checks for ``normalize_iban``: upper-casing and whitespace removal."""

    def test_uppercase_and_strip_whitespace(self) -> None:
        raw = "de 89 3704 0044 0532 0130 00"
        assert normalize_iban(raw) == "DE89370400440532013000"

    def test_already_normalised(self) -> None:
        # A compact upper-case IBAN passes through unchanged.
        iban = "CH9300762011623852957"
        assert normalize_iban(iban) == "CH9300762011623852957"

    def test_tabs_and_newlines(self) -> None:
        raw = "ch 93\t0076\n2011623852957"
        assert normalize_iban(raw) == "CH9300762011623852957"
|
|
||||||
|
|
||||||
|
|
||||||
class TestShouldSkipTextAgreement:
    """Checks for the ``should_skip_text_agreement`` short-value rules."""

    def test_short_string_skipped(self) -> None:
        verdict = should_skip_text_agreement("AB", str)
        assert verdict is True

    def test_long_string_not_skipped(self) -> None:
        verdict = should_skip_text_agreement("ABC", str)
        assert verdict is False

    def test_number_abs_lt_10_skipped(self) -> None:
        small_values = (
            (0, int),
            (9, int),
            (-9, int),
            (9.5, float),
            (Decimal("9.99"), Decimal),
        )
        for value, field_type in small_values:
            assert should_skip_text_agreement(value, field_type) is True

    def test_number_abs_ge_10_not_skipped(self) -> None:
        large_values = (
            (10, int),
            (-10, int),
            (Decimal("1234.56"), Decimal),
        )
        for value, field_type in large_values:
            assert should_skip_text_agreement(value, field_type) is False

    def test_literal_type_skipped(self) -> None:
        lit = Literal["checking", "credit", "savings"]
        verdict = should_skip_text_agreement("checking", lit)
        assert verdict is True

    def test_none_value_skipped(self) -> None:
        # None is skipped whatever the declared field type is.
        for field_type in (str, None):
            assert should_skip_text_agreement(None, field_type) is True

    def test_numeric_string_treated_as_string(self) -> None:
        # A numeric value passed as a string is judged by its length, so a
        # one-character "9" is skipped by the short-string rule.
        verdict = should_skip_text_agreement("9", str)
        assert verdict is True
|
|
||||||
Loading…
Reference in a new issue