# infoxtractor/tests/unit/test_provenance_normalize.py

"""Tests for the provenance normalisers (spec §6 ReliabilityStep)."""

from __future__ import annotations

from datetime import date, datetime
from decimal import Decimal
from typing import Literal

from ix.provenance.normalize import (
    normalize_date,
    normalize_iban,
    normalize_number,
    normalize_string,
    should_skip_text_agreement,
)


class TestNormalizeString:
    """Pins the expected output of ``normalize_string`` for sample inputs."""

    def test_uppercase_casefolded_and_punctuation_stripped(self) -> None:
        """Padding and trailing ``!!!`` disappear; letters come back lower-case."""
        raw = "  FOO bar!!!  "
        expected = "foo bar"
        assert normalize_string(raw) == expected

    def test_nfkc_applied_for_fullwidth(self) -> None:
        """Fullwidth Latin capitals are expected to come back as plain ASCII."""
        # U+FF35, U+FF22, U+FF33 spell "UBS" in the fullwidth block; NFKC
        # normalisation maps them onto their ASCII counterparts.
        fullwidth_ubs = "\uff35\uff22\uff33"
        result = normalize_string(f"{fullwidth_ubs} AG")
        assert result == "ubs ag"

    def test_whitespace_collapse(self) -> None:
        """Runs of spaces and a tab are each expected to become one space."""
        raw = "UBS   Switzerland\tAG"
        expected = "ubs switzerland ag"
        assert normalize_string(raw) == expected

    def test_strips_common_punctuation(self) -> None:
        """Comma, dot, parentheses, semicolon, colon and slash are all dropped."""
        raw = "Hello, World. (foo); bar: baz / qux"
        expected = "hello world foo bar baz qux"
        assert normalize_string(raw) == expected

    def test_empty_string(self) -> None:
        """An empty input is expected to stay empty."""
        result = normalize_string("")
        assert result == ""


class TestNormalizeNumber:
    """Pins the canonical two-decimal string expected from ``normalize_number``."""

    def test_chf_swiss_apostrophe_thousands(self) -> None:
        """A ``CHF`` prefix and apostrophe thousands separator are removed."""
        raw = "CHF 1'234.56"
        assert normalize_number(raw) == "1234.56"

    def test_de_de_dot_thousands_and_comma_decimal(self) -> None:
        """Dot-thousands / comma-decimal input with an ``EUR`` suffix."""
        raw = "1.234,56 EUR"
        assert normalize_number(raw) == "1234.56"

    def test_negative_sign(self) -> None:
        """The minus sign survives, with or without a currency prefix."""
        cases = (
            ("-123.45", "-123.45"),
            ("CHF -1'234.56", "-1234.56"),
        )
        for raw, expected in cases:
            assert normalize_number(raw) == expected

    def test_int_input(self) -> None:
        """An ``int`` is rendered with two decimal places."""
        result = normalize_number(42)
        assert result == "42.00"

    def test_float_input(self) -> None:
        """A ``float`` is rendered with two decimal places."""
        result = normalize_number(1234.5)
        assert result == "1234.50"

    def test_decimal_input(self) -> None:
        """A ``Decimal`` with two decimal places comes back unchanged."""
        amount = Decimal("1234.56")
        assert normalize_number(amount) == "1234.56"

    def test_trailing_zero_is_canonicalised(self) -> None:
        """A single fractional digit is padded to two."""
        raw = "1234.5"
        assert normalize_number(raw) == "1234.50"

    def test_no_decimal_part(self) -> None:
        """A string without a fractional part gains ``.00``."""
        raw = "1234"
        assert normalize_number(raw) == "1234.00"


class TestNormalizeDate:
    """Pins the ISO ``YYYY-MM-DD`` string expected from ``normalize_date``."""

    def test_dayfirst_dotted(self) -> None:
        """Dotted day-first text is read as day.month.year."""
        raw = "31.03.2026"
        assert normalize_date(raw) == "2026-03-31"

    def test_iso_date(self) -> None:
        """An ISO-formatted string comes back unchanged."""
        iso = "2026-03-31"
        assert normalize_date(iso) == iso

    def test_date_object(self) -> None:
        """A ``date`` instance is rendered in ISO form."""
        day = date(2026, 3, 31)
        assert normalize_date(day) == "2026-03-31"

    def test_datetime_object(self) -> None:
        """A ``datetime`` instance yields only its date part."""
        moment = datetime(2026, 3, 31, 10, 30)
        assert normalize_date(moment) == "2026-03-31"

    def test_slash_variant(self) -> None:
        """Slash-separated day-first text is read like the dotted form."""
        raw = "31/03/2026"
        assert normalize_date(raw) == "2026-03-31"


class TestNormalizeIban:
    """Pins the compact upper-case string expected from ``normalize_iban``."""

    def test_uppercase_and_strip_whitespace(self) -> None:
        """Lower-case, space-grouped input is upper-cased and joined."""
        raw = "de 89 3704 0044 0532 0130 00"
        expected = "DE89370400440532013000"
        assert normalize_iban(raw) == expected

    def test_already_normalised(self) -> None:
        """An already compact, upper-case IBAN comes back unchanged."""
        iban = "CH9300762011623852957"
        assert normalize_iban(iban) == iban

    def test_tabs_and_newlines(self) -> None:
        """Tabs and newlines are removed just like spaces."""
        raw = "ch 93\t0076\n2011623852957"
        expected = "CH9300762011623852957"
        assert normalize_iban(raw) == expected


class TestShouldSkipTextAgreement:
    """Pins which (value, type) pairs ``should_skip_text_agreement`` skips."""

    def test_short_string_skipped(self) -> None:
        """The two-character string ``"AB"`` is skipped."""
        verdict = should_skip_text_agreement("AB", str)
        assert verdict is True

    def test_long_string_not_skipped(self) -> None:
        """The three-character string ``"ABC"`` is not skipped."""
        verdict = should_skip_text_agreement("ABC", str)
        assert verdict is False

    def test_number_abs_lt_10_skipped(self) -> None:
        """Numbers with absolute value below 10 are skipped, whatever the type."""
        small_values = (
            (0, int),
            (9, int),
            (-9, int),
            (9.5, float),
            (Decimal("9.99"), Decimal),
        )
        for value, declared_type in small_values:
            assert should_skip_text_agreement(value, declared_type) is True

    def test_number_abs_ge_10_not_skipped(self) -> None:
        """Numbers with absolute value of 10 or more are not skipped."""
        large_values = (
            (10, int),
            (-10, int),
            (Decimal("1234.56"), Decimal),
        )
        for value, declared_type in large_values:
            assert should_skip_text_agreement(value, declared_type) is False

    def test_literal_type_skipped(self) -> None:
        """A value declared with a ``Literal`` type is skipped."""
        choices = Literal["checking", "credit", "savings"]
        verdict = should_skip_text_agreement("checking", choices)
        assert verdict is True

    def test_none_value_skipped(self) -> None:
        """``None`` is skipped whether the declared type is ``str`` or ``None``."""
        for declared_type in (str, None):
            assert should_skip_text_agreement(None, declared_type) is True

    def test_numeric_string_treated_as_string(self) -> None:
        """A one-character numeric string declared as ``str`` is skipped."""
        # The value is a str, so the short-string rule applies to it.
        verdict = should_skip_text_agreement("9", str)
        assert verdict is True