infoxtractor/tests/live/test_surya_client_live.py

"""Live test for :class:`SuryaOCRClient` — gated on ``IX_TEST_OLLAMA=1``.

Downloads real Surya models (hundreds of MB) on first run. Never runs in
CI. Exercised locally with::

    IX_TEST_OLLAMA=1 uv run pytest tests/live/test_surya_client_live.py -v

Note: requires the ``[ocr]`` extra — ``uv sync --extra ocr --extra dev``.
"""

from __future__ import annotations

import os
from pathlib import Path

import pytest

from ix.contracts import Page
from ix.segmentation import PageMetadata

# Module-level marks: pytest applies every entry in ``pytestmark`` to each
# test collected from this file.
pytestmark = [
    # Tag the tests as ``live`` so they can be selected or deselected by marker.
    pytest.mark.live,
    # Skip the whole module unless the environment variable is exactly "1";
    # any other value (or an unset variable) leaves the tests skipped.
    pytest.mark.skipif(
        os.environ.get("IX_TEST_OLLAMA") != "1",
        reason="live: IX_TEST_OLLAMA=1 required",
    ),
]


async def test_extracts_dkb_and_iban_from_synthetic_giro() -> None:
    """Run real Surya OCR over ``tests/fixtures/synthetic_giro.pdf``.

    The recognised text must contain ``"DKB"`` and, once spaces are
    removed, the IBAN ``DE89370400440532013000``.
    """
    from ix.ocr.surya_client import SuryaOCRClient

    fixture = Path(__file__).parent.parent / "fixtures" / "synthetic_giro.pdf"
    assert fixture.exists(), f"missing fixture: {fixture}"

    # One empty Page per PDF page, sized from PyMuPDF's page rectangle, so
    # the client receives as many inputs as the document has pages.
    import fitz

    with fitz.open(str(fixture)) as pdf:
        blank_pages = [
            Page(
                page_no=number,
                width=float(pdf_page.rect.width),
                height=float(pdf_page.rect.height),
                lines=[],
            )
            for number, pdf_page in enumerate(pdf, start=1)
        ]

    # Every page comes from the single input file, hence file_index=0.
    metadata = [PageMetadata(file_index=0) for _ in blank_pages]

    ocr_client = SuryaOCRClient()
    response = await ocr_client.ocr(
        blank_pages,
        files=[(fixture, "application/pdf")],
        page_metadata=metadata,
    )

    # Prefer the flat text; if it is missing or empty, rebuild it from the
    # per-page line texts (shape-safety).
    ocr_output = response.result
    flat_text = ocr_output.text or "\n".join(
        ocr_line.text or ""
        for ocr_page in ocr_output.pages
        for ocr_line in ocr_page.lines
    )

    assert "DKB" in flat_text

    without_spaces = flat_text.replace(" ", "")
    assert "DE89370400440532013000" in without_spaces


async def test_selfcheck_ok_against_real_predictors() -> None:
    """Assert that ``selfcheck()`` on a fresh client yields ``"ok"``."""
    from ix.ocr.surya_client import SuryaOCRClient

    status = await SuryaOCRClient().selfcheck()
    assert status == "ok"