Pure functions the ReliabilityStep will compose to compare extracted values
against OCR snippets (and context.texts). Kept in one module so every rule
is directly unit-testable without pulling in the step ABC.
Highlights:
- `normalize_string`: NFKC + casefold + strip common punctuation (. , : ; !
? () [] {} / \\ ' " `) + collapse whitespace. Substring-compatible.
- `normalize_number`: returns the canonical "[-]DDD.DD" form (always 2dp)
after stripping currency symbols and codes (e.g. "CHF", "EUR"). Heuristic separator detection handles
Swiss-German apostrophes ("1'234.56"), de-DE commas ("1.234,56"), and
plain ASCII ("1234.56" / "1234.5" / "1234"). Accepts native int/float/
Decimal as well as str.
- `normalize_date`: dateutil parse with dayfirst=True → ISO YYYY-MM-DD.
Date and datetime objects short-circuit to their ISO date (YYYY-MM-DD); for a datetime the time-of-day is dropped, so this is not the full datetime.isoformat().
- `normalize_iban`: uppercase + strip whitespace. Format validation is the
call site's job; this is pure canonicalisation.
- `should_skip_text_agreement`: dispatches on type + value. Literal → skip,
None → skip, numeric |v|<10 → skip, len(str) ≤ 2 → skip. Numeric check
runs first so `10` (len("10")==2) is treated on the numeric side
(not skipped) instead of tripping the string length rule.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
124 lines
4.5 KiB
Python
124 lines
4.5 KiB
Python
"""Tests for the provenance normalisers (spec §6 ReliabilityStep)."""
|
|
|
|
from __future__ import annotations
|
|
|
|
from datetime import date, datetime
|
|
from decimal import Decimal
|
|
from typing import Literal
|
|
|
|
from ix.provenance.normalize import (
|
|
normalize_date,
|
|
normalize_iban,
|
|
normalize_number,
|
|
normalize_string,
|
|
should_skip_text_agreement,
|
|
)
|
|
|
|
|
|
class TestNormalizeString:
    """Expectations for ``normalize_string``: case, punctuation, width, whitespace."""

    def test_uppercase_casefolded_and_punctuation_stripped(self) -> None:
        """Padded mixed-case input with trailing bangs yields bare lowercase words."""
        raw = " FOO bar!!! "
        expected = "foo bar"
        assert normalize_string(raw) == expected

    def test_nfkc_applied_for_fullwidth(self) -> None:
        """Fullwidth Latin capitals are expected to come back as plain ASCII."""
        # "UBS" spelled with code points from the U+FF00 fullwidth block; NFKC
        # normalisation maps these compatibility characters to ASCII letters.
        fullwidth_ubs = "\uff35\uff22\uff33"
        normalized = normalize_string(f"{fullwidth_ubs} AG")
        assert normalized == "ubs ag"

    def test_whitespace_collapse(self) -> None:
        """A tab between words is expected to end up as a single space."""
        raw = "UBS Switzerland\tAG"
        assert normalize_string(raw) == "ubs switzerland ag"

    def test_strips_common_punctuation(self) -> None:
        """Commas, dots, parens, semicolons, colons and slashes all disappear."""
        raw = "Hello, World. (foo); bar: baz / qux"
        expected = "hello world foo bar baz qux"
        assert normalize_string(raw) == expected

    def test_empty_string(self) -> None:
        """The empty string is expected to map to itself."""
        normalized = normalize_string("")
        assert normalized == ""
|
|
|
|
|
|
class TestNormalizeNumber:
    """Expectations for ``normalize_number``: every case yields a two-decimal string."""

    def test_chf_swiss_apostrophe_thousands(self) -> None:
        """A "CHF" prefix and apostrophe thousands separator are both removed."""
        raw = "CHF 1'234.56"
        assert normalize_number(raw) == "1234.56"

    def test_de_de_dot_thousands_and_comma_decimal(self) -> None:
        """Dot-as-thousands / comma-as-decimal input with an "EUR" suffix."""
        raw = "1.234,56 EUR"
        assert normalize_number(raw) == "1234.56"

    def test_negative_sign(self) -> None:
        """A leading minus survives, with or without a currency prefix."""
        cases = (
            ("-123.45", "-123.45"),
            ("CHF -1'234.56", "-1234.56"),
        )
        for raw, expected in cases:
            assert normalize_number(raw) == expected

    def test_int_input(self) -> None:
        """A native int is expected to be rendered with two decimals."""
        rendered = normalize_number(42)
        assert rendered == "42.00"

    def test_float_input(self) -> None:
        """A native float is expected to be padded to two decimals."""
        rendered = normalize_number(1234.5)
        assert rendered == "1234.50"

    def test_decimal_input(self) -> None:
        """A Decimal that already has two decimals comes back unchanged in value."""
        amount = Decimal("1234.56")
        assert normalize_number(amount) == "1234.56"

    def test_trailing_zero_is_canonicalised(self) -> None:
        """A one-decimal string gains a trailing zero."""
        rendered = normalize_number("1234.5")
        assert rendered == "1234.50"

    def test_no_decimal_part(self) -> None:
        """A string without a fractional part gains ".00"."""
        rendered = normalize_number("1234")
        assert rendered == "1234.00"
|
|
|
|
|
|
class TestNormalizeDate:
    """Expectations for ``normalize_date``: every case yields "2026-03-31"."""

    def test_dayfirst_dotted(self) -> None:
        """Dotted day-first input (DD.MM.YYYY) is read with the day first."""
        raw = "31.03.2026"
        assert normalize_date(raw) == "2026-03-31"

    def test_iso_date(self) -> None:
        """An ISO-formatted string is expected to pass through unchanged."""
        iso = "2026-03-31"
        assert normalize_date(iso) == iso

    def test_date_object(self) -> None:
        """A ``date`` instance is expected to be rendered as YYYY-MM-DD."""
        day = date(2026, 3, 31)
        assert normalize_date(day) == "2026-03-31"

    def test_datetime_object(self) -> None:
        """A ``datetime`` instance is expected to lose its time-of-day."""
        moment = datetime(2026, 3, 31, 10, 30)
        assert normalize_date(moment) == "2026-03-31"

    def test_slash_variant(self) -> None:
        """Slash-separated day-first input (DD/MM/YYYY) is read with the day first."""
        raw = "31/03/2026"
        assert normalize_date(raw) == "2026-03-31"
|
|
|
|
|
|
class TestNormalizeIban:
    """Expectations for ``normalize_iban``: uppercase output with no whitespace."""

    def test_uppercase_and_strip_whitespace(self) -> None:
        """Lowercase, space-grouped input becomes one uppercase run."""
        raw = "de 89 3704 0044 0532 0130 00"
        expected = "DE89370400440532013000"
        assert normalize_iban(raw) == expected

    def test_already_normalised(self) -> None:
        """Input that is already uppercase and unspaced comes back unchanged."""
        iban = "CH9300762011623852957"
        assert normalize_iban(iban) == iban

    def test_tabs_and_newlines(self) -> None:
        """Tabs and newlines are removed just like spaces."""
        raw = "ch 93\t0076\n2011623852957"
        expected = "CH9300762011623852957"
        assert normalize_iban(raw) == expected
|
|
|
|
|
|
class TestShouldSkipTextAgreement:
    """Expectations for ``should_skip_text_agreement(value, declared_type)``."""

    def test_short_string_skipped(self) -> None:
        """A two-character string is expected to be skipped."""
        verdict = should_skip_text_agreement("AB", str)
        assert verdict is True

    def test_long_string_not_skipped(self) -> None:
        """A three-character string is expected to be checked, not skipped."""
        verdict = should_skip_text_agreement("ABC", str)
        assert verdict is False

    def test_number_abs_lt_10_skipped(self) -> None:
        """Numbers whose absolute value is below 10 are expected to be skipped."""
        small_values = (
            (0, int),
            (9, int),
            (-9, int),
            (9.5, float),
            (Decimal("9.99"), Decimal),
        )
        for value, declared in small_values:
            assert should_skip_text_agreement(value, declared) is True

    def test_number_abs_ge_10_not_skipped(self) -> None:
        """Numbers whose absolute value is 10 or more are expected to be checked."""
        large_values = (
            (10, int),
            (-10, int),
            (Decimal("1234.56"), Decimal),
        )
        for value, declared in large_values:
            assert should_skip_text_agreement(value, declared) is False

    def test_literal_type_skipped(self) -> None:
        """A value declared as a ``Literal`` type is expected to be skipped."""
        account_kind = Literal["checking", "credit", "savings"]
        verdict = should_skip_text_agreement("checking", account_kind)
        assert verdict is True

    def test_none_value_skipped(self) -> None:
        """A ``None`` value is expected to be skipped whatever the declared type."""
        for declared in (str, None):
            assert should_skip_text_agreement(None, declared) is True

    def test_numeric_string_treated_as_string(self) -> None:
        """A one-character digit string declared as ``str`` is expected to be skipped."""
        # The value is a str, so the short-value rule applies to it.
        verdict = should_skip_text_agreement("9", str)
        assert verdict is True
|