Runs Surya's detection + recognition over PIL images rendered from each Page's source file (PDFs via PyMuPDF, images via Pillow). Lazy warm_up so FastAPI lifespan start stays predictable. Deferred Surya/torch imports keep the base install slim — the heavy deps stay under [ocr]. Extends OCRClient Protocol with optional files + page_metadata kwargs so the engine can resolve each page back to its on-disk source; Fake accepts-and-ignores to keep hermetic tests unchanged. selfcheck() runs the predictors on a 1x1 PIL image — wired into /healthz by Task 4.3. Tests: 6 hermetic unit tests (Surya predictors mocked, no model download); 2 live tests gated on IX_TEST_OLLAMA=1 (never run in CI). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
83 lines
2.3 KiB
Python
83 lines
2.3 KiB
Python
"""Live test for :class:`SuryaOCRClient` — gated on ``IX_TEST_OLLAMA=1``.
|
|
|
|
Downloads real Surya models (hundreds of MB) on first run. Never runs in
|
|
CI. Exercised locally with::
|
|
|
|
IX_TEST_OLLAMA=1 uv run pytest tests/live/test_surya_client_live.py -v
|
|
|
|
Note: requires the ``[ocr]`` extra — ``uv sync --extra ocr --extra dev``.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import os
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
|
|
from ix.contracts import Page
|
|
from ix.segmentation import PageMetadata
|
|
|
|
# Module-level pytest marks, applied to every test in this file: tag each
# test as ``live`` and skip it unless the environment sets IX_TEST_OLLAMA
# to exactly "1".
pytestmark = [
    pytest.mark.live,
    pytest.mark.skipif(
        os.getenv("IX_TEST_OLLAMA") != "1",
        reason="live: IX_TEST_OLLAMA=1 required",
    ),
]
|
|
|
|
|
|
async def test_extracts_dkb_and_iban_from_synthetic_giro() -> None:
    """Run real Surya OCR over ``tests/fixtures/synthetic_giro.pdf``.

    The recognised text must contain ``"DKB"`` and, once spaces are
    removed, the IBAN ``DE89370400440532013000``.
    """
    # Imported inside the test rather than at module top — presumably so
    # that collecting this module does not need the ``[ocr]`` extra.
    from ix.ocr.surya_client import SuryaOCRClient

    tests_root = Path(__file__).parent.parent
    fixture = tests_root.joinpath("fixtures", "synthetic_giro.pdf")
    assert fixture.exists(), f"missing fixture: {fixture}"

    # One empty Page per PDF page, sized from PyMuPDF's page rectangle, so
    # the OCR call receives the same number of inputs DocumentIngestor
    # would produce for this PDF.
    import fitz

    document = fitz.open(str(fixture))
    pages = []
    try:
        for index, pdf_page in enumerate(document):
            pages.append(
                Page(
                    page_no=index + 1,
                    width=float(pdf_page.rect.width),
                    height=float(pdf_page.rect.height),
                    lines=[],
                )
            )
    finally:
        # Always release the PDF handle, even if page construction fails.
        document.close()

    ocr_client = SuryaOCRClient()
    # Every page comes from the single input file, hence file_index=0.
    metadata = [PageMetadata(file_index=0) for _page in pages]
    response = await ocr_client.ocr(
        pages,
        files=[(fixture, "application/pdf")],
        page_metadata=metadata,
    )

    flat_text = response.result.text
    if not flat_text:
        # Shape-safety: when no flat text is provided, rebuild it from the
        # per-page line texts, one line per row.
        line_texts = []
        for ocr_page in response.result.pages:
            for ocr_line in ocr_page.lines:
                line_texts.append(ocr_line.text or "")
        flat_text = "\n".join(line_texts)

    assert "DKB" in flat_text
    # Strip spaces first so a grouped IBAN still matches.
    compact_text = flat_text.replace(" ", "")
    assert "DE89370400440532013000" in compact_text
|
|
|
|
|
|
async def test_selfcheck_ok_against_real_predictors() -> None:
    """Check that ``selfcheck()`` reports ``"ok"`` with Surya's real predictors."""
    # Imported inside the test rather than at module top — presumably so
    # that collecting this module does not need the ``[ocr]`` extra.
    from ix.ocr.surya_client import SuryaOCRClient

    status = await SuryaOCRClient().selfcheck()
    assert status == "ok"
|