# infoxtractor/tests/unit/test_provenance_verify.py

"""Tests for the reliability verifier (spec §6 ReliabilityStep)."""

from __future__ import annotations

from datetime import date
from decimal import Decimal
from typing import Literal

from pydantic import BaseModel

from ix.contracts import (
    ExtractionSource,
    FieldProvenance,
    ProvenanceData,
)
from ix.provenance.verify import apply_reliability_flags, verify_field


def _make_fp(
    *,
    field_path: str,
    value: object,
    snippets: list[str],
) -> FieldProvenance:
    """Build a ``FieldProvenance`` fixture with one source per snippet.

    Every source is placed on page 1 of file 0 with a relevance score of
    1.0; its segment id is ``p1_l<index>`` where ``<index>`` is the
    snippet's position in ``snippets``. The field name is the last
    dot-separated component of ``field_path``.
    """
    sources = []
    for index, snippet in enumerate(snippets):
        sources.append(
            ExtractionSource(
                page_number=1,
                file_index=0,
                text_snippet=snippet,
                relevance_score=1.0,
                segment_id=f"p1_l{index}",
            )
        )

    # Leaf component of the dotted path, e.g. "result.bank_name" -> "bank_name".
    leaf_name = field_path.split(".")[-1]
    return FieldProvenance(
        field_name=leaf_name,
        field_path=field_path,
        value=value,
        sources=sources,
    )


class TestVerifyFieldByType:
    """Checks how ``verify_field`` treats each declared field type."""

    def test_string_substring_match(self) -> None:
        """A string value contained in a snippet is verified; no texts -> no agreement flag."""
        provenance = _make_fp(
            field_path="result.bank_name",
            value="UBS AG",
            snippets=["Account at UBS AG, Zurich"],
        )
        verified, agreement = verify_field(provenance, str, texts=[])
        assert verified is True
        assert agreement is None

    def test_string_mismatch_is_false(self) -> None:
        """A string value absent from every snippet is not verified."""
        provenance = _make_fp(
            field_path="result.bank_name",
            value="UBS AG",
            snippets=["Credit Suisse"],
        )
        verified, _ = verify_field(provenance, str, texts=[])
        assert verified is False

    def test_number_decimal_match_ignores_currency(self) -> None:
        """Decimal 1234.56 is verified against the snippet ``CHF 1'234.56``."""
        provenance = _make_fp(
            field_path="result.closing_balance",
            value=Decimal("1234.56"),
            snippets=["CHF 1'234.56"],
        )
        verified, _ = verify_field(provenance, Decimal, texts=[])
        assert verified is True

    def test_number_mismatch(self) -> None:
        """Decimal 1234.56 is not verified against the snippet ``CHF 9999.99``."""
        provenance = _make_fp(
            field_path="result.closing_balance",
            value=Decimal("1234.56"),
            snippets=["CHF 9999.99"],
        )
        verified, _ = verify_field(provenance, Decimal, texts=[])
        assert verified is False

    def test_date_parse_both_sides(self) -> None:
        """A ``date`` value is verified against a ``DD.MM.YYYY`` snippet."""
        provenance = _make_fp(
            field_path="result.statement_date",
            value=date(2026, 3, 31),
            snippets=["Statement date: 31.03.2026"],
        )
        verified, _ = verify_field(provenance, date, texts=[])
        assert verified is True

    def test_iban_strip_and_case(self) -> None:
        """A compact IBAN value is verified against a space-grouped snippet."""
        # IBAN detection: field name contains "iban".
        provenance = _make_fp(
            field_path="result.account_iban",
            value="CH9300762011623852957",
            snippets=["Account CH93 0076 2011 6238 5295 7"],
        )
        verified, _ = verify_field(provenance, str, texts=[])
        assert verified is True

    def test_literal_field_both_flags_none(self) -> None:
        """A ``Literal``-typed field yields ``None`` for both flags."""
        account_kind = Literal["checking", "credit", "savings"]
        provenance = _make_fp(
            field_path="result.account_type",
            value="checking",
            snippets=["the word checking is literally here"],
        )
        verified, agreement = verify_field(
            provenance,
            account_kind,
            texts=["checking"],
        )
        assert verified is None
        assert agreement is None

    def test_none_value_both_flags_none(self) -> None:
        """A ``None`` value yields ``None`` for both flags."""
        provenance = _make_fp(
            field_path="result.account_iban",
            value=None,
            snippets=["whatever"],
        )
        verified, agreement = verify_field(provenance, str, texts=["whatever"])
        assert verified is None
        assert agreement is None


class TestTextAgreement:
    """Checks the second flag returned by ``verify_field`` (text agreement)."""

    def test_text_agreement_with_texts_true(self) -> None:
        """The value occurs in the supplied texts -> agreement is True."""
        provenance = _make_fp(
            field_path="result.bank_name",
            value="UBS AG",
            snippets=["UBS AG"],
        )
        page_texts = ["Account at UBS AG"]
        _, agreement = verify_field(provenance, str, texts=page_texts)
        assert agreement is True

    def test_text_agreement_with_texts_false(self) -> None:
        """The value is missing from the supplied texts -> agreement is False."""
        provenance = _make_fp(
            field_path="result.bank_name",
            value="UBS AG",
            snippets=["UBS AG"],
        )
        page_texts = ["Credit Suisse"]
        _, agreement = verify_field(provenance, str, texts=page_texts)
        assert agreement is False

    def test_text_agreement_no_texts_is_none(self) -> None:
        """An empty ``texts`` list -> agreement is None."""
        provenance = _make_fp(
            field_path="result.bank_name",
            value="UBS AG",
            snippets=["UBS AG"],
        )
        _, agreement = verify_field(provenance, str, texts=[])
        assert agreement is None

    def test_short_value_skips_text_agreement(self) -> None:
        """A two-character string is verified but gets no agreement flag."""
        # 2-char string
        provenance = _make_fp(
            field_path="result.code",
            value="XY",
            snippets=["code XY here"],
        )
        page_texts = ["another XY reference"]
        verified, agreement = verify_field(provenance, str, texts=page_texts)
        # provenance_verified still runs; text_agreement is skipped.
        assert verified is True
        assert agreement is None

    def test_small_number_skips_text_agreement(self) -> None:
        """The integer 5 is verified but gets no agreement flag."""
        provenance = _make_fp(
            field_path="result.n",
            value=5,
            snippets=["value 5 here"],
        )
        page_texts = ["the number 5"]
        verified, agreement = verify_field(provenance, int, texts=page_texts)
        assert verified is True
        assert agreement is None


class TestApplyReliabilityFlags:
    """Checks that ``apply_reliability_flags`` annotates a whole ``ProvenanceData``."""

    def test_writes_flags_and_counters(self) -> None:
        """Flags are written onto each field and counters land in ``quality_metrics``."""

        class BankHeader(BaseModel):
            bank_name: str
            account_iban: str | None = None
            closing_balance: Decimal | None = None
            account_type: Literal["checking", "credit", "savings"] | None = None

        # Build the field mapping up front; insertion order matches the
        # declaration order of BankHeader.
        field_map = {
            "result.bank_name": _make_fp(
                field_path="result.bank_name",
                value="UBS AG",
                snippets=["Account at UBS AG"],
            ),
            "result.account_iban": _make_fp(
                field_path="result.account_iban",
                value="CH9300762011623852957",
                snippets=["IBAN CH93 0076 2011 6238 5295 7"],
            ),
            "result.closing_balance": _make_fp(
                field_path="result.closing_balance",
                value=Decimal("1234.56"),
                snippets=["Closing balance CHF 1'234.56"],
            ),
            "result.account_type": _make_fp(
                field_path="result.account_type",
                value="checking",
                snippets=["current account (checking)"],
            ),
        }
        prov = ProvenanceData(fields=field_map)

        page_texts = ["Account at UBS AG at CH9300762011623852957"]
        apply_reliability_flags(prov, BankHeader, texts=page_texts)

        annotated = prov.fields
        assert annotated["result.bank_name"].provenance_verified is True
        assert annotated["result.bank_name"].text_agreement is True
        assert annotated["result.account_iban"].provenance_verified is True
        assert annotated["result.closing_balance"].provenance_verified is True
        # account_type is Literal → both flags None.
        assert annotated["result.account_type"].provenance_verified is None
        assert annotated["result.account_type"].text_agreement is None

        # Counters record only True values.
        metrics = prov.quality_metrics
        assert metrics["verified_fields"] == 3  # all except Literal
        # text_agreement_fields counts only fields where the flag is True.
        # bank_name True; IBAN True (appears in texts after normalisation);
        # closing_balance -- '1234.56' doesn't appear in the text.
        assert metrics["text_agreement_fields"] >= 1