infoxtractor/tests/unit/test_pipeline_end_to_end.py

"""End-to-end pipeline test with the fake OCR + GenAI clients (spec sections 6-9).

Feeds the committed ``tests/fixtures/synthetic_giro.pdf`` through the
full five-step pipeline with canned OCR + canned LLM responses.
Hermetic: no Surya, no Ollama, no network.
"""

from __future__ import annotations

from datetime import date
from decimal import Decimal
from pathlib import Path
from typing import Any

import pytest
from pydantic import BaseModel

from ix.contracts import (
    Context,
    Line,
    OCRDetails,
    OCROptions,
    OCRResult,
    Options,
    Page,
    ProvenanceOptions,
    RequestIX,
    SegmentCitation,
)
from ix.genai import FakeGenAIClient, GenAIUsage
from ix.ocr import FakeOCRClient
from ix.pipeline import Pipeline
from ix.pipeline.genai_step import GenAIStep
from ix.pipeline.ocr_step import OCRStep
from ix.pipeline.reliability_step import ReliabilityStep
from ix.pipeline.response_handler_step import ResponseHandlerStep
from ix.pipeline.setup_step import SetupStep
from ix.use_cases.bank_statement_header import BankStatementHeader

# Committed PDF fixture: two directories above this file, under ``fixtures/``.
FIXTURE_PDF = Path(__file__).resolve().parent.parent.joinpath(
    "fixtures", "synthetic_giro.pdf"
)


# Ground truth for the synthetic statement. These values have to agree with
# both the text the fixture builder writes onto the page and the canned OCR
# lines defined further down in this module.
EXPECTED_BANK_NAME = "DKB"
EXPECTED_IBAN = "DE89370400440532013000"
EXPECTED_OPENING = Decimal("1234.56")
EXPECTED_CLOSING = Decimal("1450.22")
EXPECTED_CURRENCY = "EUR"
EXPECTED_STATEMENT_DATE = date(year=2026, month=3, day=31)
EXPECTED_PERIOD_START = date(year=2026, month=3, day=1)
EXPECTED_PERIOD_END = date(year=2026, month=3, day=31)


def _canned_ocr_result() -> OCRResult:
    """Return the canned single-page OCR result for the synthetic_giro fixture.

    The line texts are the strings create_fixture_pdf.py places on the
    page. The boxes are approximate rather than exact: they follow the
    fixture builder's 72 pt left margin and 24 pt line pitch on a 595x842
    page so that normalised coordinates land in a sensible 0-1 range.
    """
    page_width = 595.0
    page_height = 842.0
    left_x, right_x, box_height = 72.0, 500.0, 16.0

    # (text, top y) for each line, 24 pt apart.
    rows = [
        ("DKB", 60.0),
        ("IBAN: DE89370400440532013000", 84.0),
        ("Statement period: 01.03.2026 - 31.03.2026", 108.0),
        ("Opening balance: 1234.56 EUR", 132.0),
        ("Closing balance: 1450.22 EUR", 156.0),
        ("Statement date: 31.03.2026", 180.0),
    ]

    def _box(top_y: float) -> list[float]:
        # Flat corner list: (left, top), (right, top), (right, bottom),
        # (left, bottom).
        bottom_y = top_y + box_height
        return [left_x, top_y, right_x, top_y, right_x, bottom_y, left_x, bottom_y]

    ocr_lines: list[Line] = [
        Line(text=row_text, bounding_box=_box(row_top)) for row_text, row_top in rows
    ]
    full_text = "\n".join(row_text for row_text, _ in rows)
    page = Page(page_no=1, width=page_width, height=page_height, lines=ocr_lines)
    details = OCRDetails(text=full_text, pages=[page])
    return OCRResult(result=details, meta_data={"engine": "fake"})


class _WrappedResponse(BaseModel):
    """Test-local model for the canned GenAI response.

    Mirrors the runtime ProvenanceWrappedResponse GenAIStep creates: the
    extracted result plus the segment citations that back its fields.
    """

    # Extracted bank-statement header values.
    result: BankStatementHeader
    # Citations for the result fields; an empty list when none are given.
    # NOTE(review): the mutable ``[]`` default relies on pydantic copying
    # field defaults per instance - confirm against the pydantic docs.
    segment_citations: list[SegmentCitation] = []


def _canned_llm_output() -> _WrappedResponse:
    """Return the canned LLM answer: expected header values plus citations."""
    # Segment ids refer to the real OCR lines only. After OCRStep injects
    # the <page> tag lines, those tag lines are skipped by
    # SegmentIndex.build, so the first real line on page 1 is p1_l0:
    #   p1_l0 -> "DKB"
    #   p1_l1 -> "IBAN: DE89370400440532013000"
    #   p1_l2 -> "Statement period: 01.03.2026 - 31.03.2026"
    #   p1_l3 -> "Opening balance: 1234.56 EUR"
    #   p1_l4 -> "Closing balance: 1450.22 EUR"
    #   p1_l5 -> "Statement date: 31.03.2026"
    header = BankStatementHeader(
        bank_name=EXPECTED_BANK_NAME,
        account_iban=EXPECTED_IBAN,
        account_type="checking",
        currency=EXPECTED_CURRENCY,
        statement_date=EXPECTED_STATEMENT_DATE,
        statement_period_start=EXPECTED_PERIOD_START,
        statement_period_end=EXPECTED_PERIOD_END,
        opening_balance=EXPECTED_OPENING,
        closing_balance=EXPECTED_CLOSING,
    )

    # (field path, value segment ids, context segment ids), in output order.
    citation_rows = [
        ("result.bank_name", ["p1_l0"], []),
        ("result.account_iban", ["p1_l1"], []),
        # account_type has no value segment; it only cites p1_l0 as context.
        ("result.account_type", [], ["p1_l0"]),
        ("result.currency", ["p1_l3", "p1_l4"], []),
        ("result.statement_date", ["p1_l5"], []),
        ("result.statement_period_start", ["p1_l2"], []),
        ("result.statement_period_end", ["p1_l2"], []),
        ("result.opening_balance", ["p1_l3"], []),
        ("result.closing_balance", ["p1_l4"], []),
    ]
    citations = [
        SegmentCitation(
            field_path=path,
            value_segment_ids=value_ids,
            context_segment_ids=context_ids,
        )
        for path, value_ids, context_ids in citation_rows
    ]
    return _WrappedResponse(result=header, segment_citations=citations)


def _build_pipeline(fetch_config: Any = None) -> Pipeline:
    """Assemble the five-step pipeline wired to the fake OCR and GenAI clients.

    Args:
        fetch_config: Optional value passed through as
            ``SetupStep(fetch_config=...)``. When left as ``None`` the
            setup step is built with no arguments.

    Returns:
        A ``Pipeline`` whose steps are, in order: setup, OCR, GenAI,
        reliability and response handling.
    """
    ocr_client = FakeOCRClient(canned=_canned_ocr_result())
    genai_client = FakeGenAIClient(
        parsed=_canned_llm_output(),
        usage=GenAIUsage(prompt_tokens=200, completion_tokens=400),
        model_name="fake-gpt",
    )
    # Test against None explicitly: a truthiness check would silently drop a
    # fetch_config that was supplied but happens to evaluate as falsy.
    if fetch_config is None:
        setup = SetupStep()
    else:
        setup = SetupStep(fetch_config=fetch_config)
    return Pipeline(
        steps=[
            setup,
            OCRStep(ocr_client=ocr_client),
            GenAIStep(genai_client=genai_client),
            ReliabilityStep(),
            ResponseHandlerStep(),
        ]
    )


class TestEndToEnd:
    """Run the full pipeline with the fake clients and inspect the response."""

    @pytest.fixture
    def request_ix(self, tmp_path: Path) -> RequestIX:
        """Build the canonical single-file request for the committed fixture.

        The file is referenced through a file:// URL. A Paperless-style
        text carrying the same values is attached so that text_agreement
        has real data to compare against.
        """
        # NOTE(review): tmp_path is requested but not referenced in this body.
        sentences = [
            "DKB statement.",
            "IBAN: DE89370400440532013000.",
            "Period 01.03.2026 - 31.03.2026.",
            "Opening balance 1234.56 EUR.",
            "Closing balance 1450.22 EUR.",
            "Date 31.03.2026.",
        ]
        paperless_text = " ".join(sentences)

        context = Context(files=[FIXTURE_PDF.as_uri()], texts=[paperless_text])
        ocr_options = OCROptions(use_ocr=True)
        provenance_options = ProvenanceOptions(
            include_provenance=True, max_sources_per_field=5
        )
        options = Options(ocr=ocr_options, provenance=provenance_options)
        return RequestIX(
            use_case="bank_statement_header",
            ix_client_id="mammon-test",
            request_id="end-to-end-1",
            ix_id="abcd0123ef456789",
            context=context,
            options=options,
        )

    async def test_ix_result_populated_from_fake_llm(self, request_ix: RequestIX) -> None:
        """The result dict carries the values from the canned LLM output."""
        response = await _build_pipeline().start(request_ix)
        assert response.error is None
        extracted = response.ix_result.result
        expectations = {
            "bank_name": EXPECTED_BANK_NAME,
            "account_iban": EXPECTED_IBAN,
            "currency": EXPECTED_CURRENCY,
            # Pydantic v2 dumps Decimals as strings in mode="json".
            "closing_balance": str(EXPECTED_CLOSING),
        }
        for key, expected in expectations.items():
            assert extracted[key] == expected

    async def test_provenance_verified_for_closing_balance(
        self, request_ix: RequestIX
    ) -> None:
        """The closing balance field is marked as provenance-verified."""
        response = await _build_pipeline().start(request_ix)
        assert response.provenance is not None
        closing = response.provenance.fields["result.closing_balance"]
        assert closing.provenance_verified is True

    async def test_text_agreement_true_when_texts_match_value(
        self, request_ix: RequestIX
    ) -> None:
        """The closing balance agrees with the supplied Paperless-style text."""
        response = await _build_pipeline().start(request_ix)
        assert response.provenance is not None
        closing = response.provenance.fields["result.closing_balance"]
        assert closing.text_agreement is True

    async def test_timings_per_step(self, request_ix: RequestIX) -> None:
        """Every step records one timing entry, in execution order."""
        response = await _build_pipeline().start(request_ix)
        expected_steps = [
            "SetupStep",
            "OCRStep",
            "GenAIStep",
            "ReliabilityStep",
            "ResponseHandlerStep",
        ]
        # Each of the five steps executed and recorded a timing.
        recorded_steps = [timing["step"] for timing in response.metadata.timings]
        assert recorded_steps == expected_steps
        for timing in response.metadata.timings:
            assert isinstance(timing["elapsed_seconds"], float)

    async def test_no_error_and_context_stripped(self, request_ix: RequestIX) -> None:
        """The response has no error and its dump has no "context" key."""
        response = await _build_pipeline().start(request_ix)
        assert response.error is None
        dumped = response.model_dump()
        assert "context" not in dumped