"""Live test for :class:`SuryaOCRClient` — gated on ``IX_TEST_OLLAMA=1``. Downloads real Surya models (hundreds of MB) on first run. Never runs in CI. Exercised locally with:: IX_TEST_OLLAMA=1 uv run pytest tests/live/test_surya_client_live.py -v Note: requires the ``[ocr]`` extra — ``uv sync --extra ocr --extra dev``. """ from __future__ import annotations import os from pathlib import Path import pytest from ix.contracts import Page from ix.segmentation import PageMetadata pytestmark = [ pytest.mark.live, pytest.mark.skipif( os.environ.get("IX_TEST_OLLAMA") != "1", reason="live: IX_TEST_OLLAMA=1 required", ), ] async def test_extracts_dkb_and_iban_from_synthetic_giro() -> None: """Real Surya run against ``tests/fixtures/synthetic_giro.pdf``. Assert the flat text contains ``"DKB"`` and the IBAN without spaces. """ from ix.ocr.surya_client import SuryaOCRClient fixture = Path(__file__).parent.parent / "fixtures" / "synthetic_giro.pdf" assert fixture.exists(), f"missing fixture: {fixture}" # Build Pages the way DocumentIngestor would for this PDF: count pages # via PyMuPDF so we pass the right number of inputs. import fitz doc = fitz.open(str(fixture)) try: pages = [ Page( page_no=i + 1, width=float(p.rect.width), height=float(p.rect.height), lines=[], ) for i, p in enumerate(doc) ] finally: doc.close() client = SuryaOCRClient() result = await client.ocr( pages, files=[(fixture, "application/pdf")], page_metadata=[PageMetadata(file_index=0) for _ in pages], ) flat_text = result.result.text or "" # Join page-level line texts if flat text missing (shape-safety). if not flat_text: flat_text = "\n".join( line.text or "" for page in result.result.pages for line in page.lines ) assert "DKB" in flat_text assert "DE89370400440532013000" in flat_text.replace(" ", "") async def test_selfcheck_ok_against_real_predictors() -> None: """``selfcheck()`` returns ``ok`` once Surya's predictors load.""" from ix.ocr.surya_client import SuryaOCRClient client = SuryaOCRClient() assert await client.selfcheck() == "ok"