infoxtractor/src/ix/ocr/client.py

"""OCRClient Protocol (spec §6.2).

Structural typing: any object with an async ``ocr(pages) -> OCRResult``
method satisfies the Protocol. :class:`~ix.pipeline.ocr_step.OCRStep`
depends on the Protocol, not a concrete class, so swapping engines
(``FakeOCRClient`` in tests, ``SuryaOCRClient`` in prod) stays a wiring
change at the app factory.

Per-page source location (``files`` + ``page_metadata``) flows in as
optional kwargs: fakes ignore them; the real
:class:`~ix.ocr.surya_client.SuryaOCRClient` uses them to render each
page's pixels back from disk. Keeping these optional lets unit tests stay
pages-only while production wiring (Task 4.3) plumbs through the real
filesystem handles.
"""

from __future__ import annotations

from pathlib import Path
from typing import Any, Protocol, runtime_checkable

from ix.contracts import OCRResult, Page


@runtime_checkable
class OCRClient(Protocol):
    """Structural interface for an asynchronous OCR engine.

    The pipeline hands an implementation the flat list of pages assembled
    by :class:`~ix.pipeline.setup_step.SetupStep`. The implementation
    answers with an :class:`~ix.contracts.OCRResult` holding exactly one
    :class:`~ix.contracts.Page` for every input page, preserving the
    input order.

    Note: because of ``@runtime_checkable``, ``isinstance(obj, OCRClient)``
    only verifies that ``obj`` exposes an ``ocr`` member; it validates
    neither the signature nor that the method is a coroutine function.
    """

    async def ocr(
        self,
        pages: list[Page],
        *,
        files: list[tuple[Path, str]] | None = None,
        page_metadata: list[Any] | None = None,
    ) -> OCRResult:
        """OCR the given pages and hand back the structured result.

        ``pages`` is the only required argument. ``files`` and
        ``page_metadata`` are keyword-only and default to ``None`` so
        hermetic tests can call this with pages alone; engines that have
        to re-render pages from disk are the ones that consume them.
        """
        ...


# Public API of this module: the Protocol is the only exported name.
__all__ = ["OCRClient"]