infoxtractor/tests/unit/test_provenance_normalize.py
Dirk Riemann 527fc620fe
All checks were successful
tests / test (pull_request) Successful in 1m0s
tests / test (push) Successful in 1m28s
feat(provenance): normalisers + short-value skip rule (spec §6)
Pure functions the ReliabilityStep will compose to compare extracted values
against OCR snippets (and context.texts). Kept in one module so every rule
is directly unit-testable without pulling in the step ABC.

Highlights:

- `normalize_string`: NFKC + casefold + strip common punctuation (. , : ; !
  ? () [] {} / \\ ' " `) + collapse whitespace. Substring-compatible.

- `normalize_number`: returns the canonical "[-]DDD.DD" form (always 2dp)
  after stripping currency symbols. Heuristic separator detection handles
  Swiss-German apostrophes ("1'234.56"), de-DE commas ("1.234,56"), and
  plain ASCII ("1234.56" / "1234.5" / "1234"). Accepts native int/float/
  Decimal as well as str.

- `normalize_date`: dateutil parse with dayfirst=True → ISO YYYY-MM-DD.
  Date and datetime objects short-circuit to the ISO form of their date part (YYYY-MM-DD; any time component is dropped).

- `normalize_iban`: uppercase + strip whitespace. Format validation is the
  call site's job; this is pure canonicalisation.

- `should_skip_text_agreement`: dispatches on type + value. Literal → skip,
  None → skip, numeric |v|<10 → skip, len(str) ≤ 2 → skip. Numeric check
  runs first so `10` (len("10")==2) is treated on the numeric side
  (not skipped) instead of tripping the string length rule.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-18 10:56:31 +02:00

124 lines
4.5 KiB
Python

"""Tests for the provenance normalisers (spec §6 ReliabilityStep)."""
from __future__ import annotations
from datetime import date, datetime
from decimal import Decimal
from typing import Literal
from ix.provenance.normalize import (
normalize_date,
normalize_iban,
normalize_number,
normalize_string,
should_skip_text_agreement,
)
class TestNormalizeString:
    """Expectations for ``normalize_string`` on representative inputs."""

    def test_uppercase_casefolded_and_punctuation_stripped(self) -> None:
        """Case, surrounding blanks and trailing ``!`` must all vanish."""
        raw = " FOO bar!!! "
        assert normalize_string(raw) == "foo bar"

    def test_nfkc_applied_for_fullwidth(self) -> None:
        """Fullwidth capitals must come out as plain lowercase ASCII."""
        # "UBS" written with U+FF35 / U+FF22 / U+FF33 (fullwidth block);
        # NFKC normalisation maps these to their ASCII counterparts.
        fullwidth_ubs = "\uff35\uff22\uff33"
        normalised = normalize_string(f"{fullwidth_ubs} AG")
        assert normalised == "ubs ag"

    def test_whitespace_collapse(self) -> None:
        """A tab between words must become a single space."""
        raw = "UBS Switzerland\tAG"
        assert normalize_string(raw) == "ubs switzerland ag"

    def test_strips_common_punctuation(self) -> None:
        """Comma, dot, parens, semicolon, colon and slash are removed."""
        raw = "Hello, World. (foo); bar: baz / qux"
        expected = "hello world foo bar baz qux"
        assert normalize_string(raw) == expected

    def test_empty_string(self) -> None:
        """The empty string maps to itself."""
        assert normalize_string("") == ""
class TestNormalizeNumber:
    """Expectations for ``normalize_number``: canonical two-decimal strings."""

    def test_chf_swiss_apostrophe_thousands(self) -> None:
        """Currency prefix and apostrophe thousands separator are dropped."""
        result = normalize_number("CHF 1'234.56")
        assert result == "1234.56"

    def test_de_de_dot_thousands_and_comma_decimal(self) -> None:
        """Dot-thousands / comma-decimal input becomes dot-decimal."""
        result = normalize_number("1.234,56 EUR")
        assert result == "1234.56"

    def test_negative_sign(self) -> None:
        """A leading minus survives, with or without a currency prefix."""
        plain = normalize_number("-123.45")
        with_currency = normalize_number("CHF -1'234.56")
        assert plain == "-123.45"
        assert with_currency == "-1234.56"

    def test_int_input(self) -> None:
        """A native int is rendered with two decimal places."""
        result = normalize_number(42)
        assert result == "42.00"

    def test_float_input(self) -> None:
        """A native float is padded to two decimal places."""
        result = normalize_number(1234.5)
        assert result == "1234.50"

    def test_decimal_input(self) -> None:
        """A ``Decimal`` with two places passes through unchanged."""
        amount = Decimal("1234.56")
        assert normalize_number(amount) == "1234.56"

    def test_trailing_zero_is_canonicalised(self) -> None:
        """A one-decimal string gains a trailing zero."""
        result = normalize_number("1234.5")
        assert result == "1234.50"

    def test_no_decimal_part(self) -> None:
        """An integer-looking string gains ``.00``."""
        result = normalize_number("1234")
        assert result == "1234.00"
class TestNormalizeDate:
    """Expectations for ``normalize_date``: every input yields YYYY-MM-DD."""

    def test_dayfirst_dotted(self) -> None:
        """Dotted day-first input is read as day.month.year."""
        expected = "2026-03-31"
        assert normalize_date("31.03.2026") == expected

    def test_iso_date(self) -> None:
        """An ISO date string comes back unchanged."""
        iso = "2026-03-31"
        assert normalize_date(iso) == "2026-03-31"

    def test_date_object(self) -> None:
        """A ``date`` instance maps to its ISO string."""
        day = date(2026, 3, 31)
        assert normalize_date(day) == "2026-03-31"

    def test_datetime_object(self) -> None:
        """A ``datetime`` instance yields only the date portion."""
        moment = datetime(2026, 3, 31, 10, 30)
        assert normalize_date(moment) == "2026-03-31"

    def test_slash_variant(self) -> None:
        """Slash-separated day-first input is accepted too."""
        expected = "2026-03-31"
        assert normalize_date("31/03/2026") == expected
class TestNormalizeIban:
    """Expectations for ``normalize_iban``: uppercase, no whitespace."""

    def test_uppercase_and_strip_whitespace(self) -> None:
        """Lowercase country code and space grouping are canonicalised."""
        spaced = "de 89 3704 0044 0532 0130 00"
        assert normalize_iban(spaced) == "DE89370400440532013000"

    def test_already_normalised(self) -> None:
        """A canonical IBAN is returned as-is."""
        canonical = "CH9300762011623852957"
        assert normalize_iban(canonical) == "CH9300762011623852957"

    def test_tabs_and_newlines(self) -> None:
        """Tabs and newlines count as whitespace and are removed."""
        messy = "ch 93\t0076\n2011623852957"
        assert normalize_iban(messy) == "CH9300762011623852957"
class TestShouldSkipTextAgreement:
    """Expectations for ``should_skip_text_agreement(value, type)``."""

    def test_short_string_skipped(self) -> None:
        """A two-character string is skipped."""
        verdict = should_skip_text_agreement("AB", str)
        assert verdict is True

    def test_long_string_not_skipped(self) -> None:
        """A three-character string is not skipped."""
        verdict = should_skip_text_agreement("ABC", str)
        assert verdict is False

    def test_number_abs_lt_10_skipped(self) -> None:
        """Numbers with magnitude below ten are skipped, whatever the type."""
        small_numbers = (
            (0, int),
            (9, int),
            (-9, int),
            (9.5, float),
            (Decimal("9.99"), Decimal),
        )
        for value, kind in small_numbers:
            assert should_skip_text_agreement(value, kind) is True

    def test_number_abs_ge_10_not_skipped(self) -> None:
        """Numbers with magnitude of ten or more are not skipped."""
        large_numbers = (
            (10, int),
            (-10, int),
            (Decimal("1234.56"), Decimal),
        )
        for value, kind in large_numbers:
            assert should_skip_text_agreement(value, kind) is False

    def test_literal_type_skipped(self) -> None:
        """A ``Literal[...]`` field type is skipped."""
        lit = Literal["checking", "credit", "savings"]
        verdict = should_skip_text_agreement("checking", lit)
        assert verdict is True

    def test_none_value_skipped(self) -> None:
        """``None`` is skipped for either declared type used here."""
        for kind in (str, None):
            assert should_skip_text_agreement(None, kind) is True

    def test_numeric_string_treated_as_string(self) -> None:
        """A one-character digit string is skipped as a string."""
        # Short stringified numeric values still trip the short-value rule.
        verdict = should_skip_text_agreement("9", str)
        assert verdict is True