Runs Surya's detection + recognition over PIL images rendered from each Page's source file (PDFs via PyMuPDF, images via Pillow). Lazy warm_up so FastAPI lifespan start stays predictable. Deferred Surya/torch imports keep the base install slim — the heavy deps stay under [ocr]. Extends OCRClient Protocol with optional files + page_metadata kwargs so the engine can resolve each page back to its on-disk source; Fake accepts-and-ignores to keep hermetic tests unchanged. selfcheck() runs the predictors on a 1x1 PIL image — wired into /healthz by Task 4.3. Tests: 6 hermetic unit tests (Surya predictors mocked, no model download); 2 live tests gated on IX_TEST_OLLAMA=1 (never run in CI). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
49 lines
1.4 KiB
Python
49 lines
1.4 KiB
Python
"""FakeOCRClient — returns a canned :class:`OCRResult` for hermetic tests.
|
|
|
|
Used by every pipeline unit test to avoid booting Surya / CUDA. The
|
|
``raise_on_call`` hook lets error-path tests exercise ``IX_002_000``-style
|
|
code paths without needing to forge network errors.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from ix.contracts import OCRResult, Page
|
|
|
|
|
|
class FakeOCRClient:
    """Test double that satisfies :class:`~ix.ocr.client.OCRClient` structurally.

    Every :meth:`ocr` call either hands back one pre-built result or raises
    one pre-configured exception; the arguments passed to :meth:`ocr` are
    never inspected.

    Parameters
    ----------
    canned:
        The :class:`OCRResult` to return from every :meth:`ocr` call.
    raise_on_call:
        If set, :meth:`ocr` raises this exception instead of returning.
    """

    def __init__(
        self,
        canned: OCRResult,
        *,
        raise_on_call: BaseException | None = None,
    ) -> None:
        """Remember the canned result and the optional exception to raise."""
        self._canned = canned
        self._raise_on_call = raise_on_call

    async def ocr(
        self,
        pages: list[Page],
        **_kwargs: object,
    ) -> OCRResult:
        """Return the canned result or raise the configured error.

        ``pages`` is accepted but not read. Any extra keyword arguments
        (for example ``files`` / ``page_metadata``) are accepted and
        ignored, so the fake stays swappable for :class:`SuryaOCRClient`
        at call sites that pass them.

        Raises
        ------
        BaseException
            The exact ``raise_on_call`` instance given to the constructor,
            when one was provided.
        """
        # The configured error takes precedence over the canned result.
        if self._raise_on_call is not None:
            raise self._raise_on_call
        return self._canned
|
|
|
|
|
|
# Explicit public API: ``from <module> import *`` exports only the fake client.
__all__ = ["FakeOCRClient"]
|