# infoxtractor/src/ix/pipeline/ocr_step.py

"""OCRStep — run OCR, inject page tags, build the SegmentIndex (spec §6.2).

Runs after :class:`~ix.pipeline.setup_step.SetupStep`. Three things
happen:

1. Dispatch the flat page list to the injected :class:`OCRClient` and
   write the raw :class:`~ix.contracts.OCRResult` onto the response.
2. Inject ``<page file="..." number="...">`` / ``</page>`` tag lines
   around each page's content so the GenAIStep can ground citations
   visually (spec §6.2c).
3. When provenance is on, build a :class:`SegmentIndex` over the
   *non-tag* lines and stash it on the internal context.

Validation follows the triad-or-use_ocr rule from the spec: if any of
``include_geometries`` / ``include_ocr_text`` / ``ocr_only`` is set but
``context.files`` is empty, raise ``IX_000_004``. If OCR isn't wanted
(text-only request), return ``False`` to skip the step silently.
"""

from __future__ import annotations

from ix.contracts import Line, RequestIX, ResponseIX
from ix.errors import IXErrorCode, IXException
from ix.ocr.client import OCRClient
from ix.pipeline.step import Step
from ix.segmentation import PageMetadata, SegmentIndex


class OCRStep(Step):
    """Inject-and-index OCR stage.

    ``validate`` decides whether the step runs; ``process`` calls the
    injected OCR client, wraps every OCR page in ``<page ...>`` /
    ``</page>`` tag lines, stores the result on the response and, when
    provenance is requested, builds a :class:`SegmentIndex` on the context.
    """

    def __init__(self, ocr_client: OCRClient) -> None:
        """Keep a reference to the OCR client used by :meth:`process`."""
        self._client = ocr_client

    async def validate(self, request_ix: RequestIX, response_ix: ResponseIX) -> bool:
        """Decide whether OCR should run for this request.

        Returns:
            ``True`` when the context carries files and ``use_ocr`` or any
            of the OCR artifact flags is set; ``False`` otherwise.

        Raises:
            IXException: ``IX_000_004`` when an OCR artifact flag
                (``include_geometries`` / ``include_ocr_text`` /
                ``ocr_only``) is set but the context has no files.
        """
        opts = request_ix.options.ocr
        ctx = response_ix.context
        # ``or []`` mirrors ``process``: a context whose ``files`` is None
        # counts as "no files" instead of crashing in ``list(None)``.
        files = list(getattr(ctx, "files", []) or []) if ctx is not None else []

        ocr_artifacts_requested = (
            opts.include_geometries or opts.include_ocr_text or opts.ocr_only
        )
        if not files:
            if ocr_artifacts_requested:
                # OCR output was explicitly asked for, but there is nothing
                # to run OCR on.
                raise IXException(IXErrorCode.IX_000_004)
            return False

        # OCR runs if use_ocr OR any of the artifact flags is set.
        return bool(opts.use_ocr or ocr_artifacts_requested)

    async def process(
        self, request_ix: RequestIX, response_ix: ResponseIX
    ) -> ResponseIX:
        """Run OCR, inject page tags and optionally build the segment index.

        Side effects:
            * ``response_ix.ocr_result`` is set to the tagged OCR result.
            * ``response_ix.context.segment_index`` is set when
              ``options.provenance.include_provenance`` is truthy.

        Returns:
            The same ``response_ix`` instance that was passed in.
        """
        ctx = response_ix.context
        assert ctx is not None, "SetupStep must populate response_ix.context"

        pages = list(getattr(ctx, "pages", []) or [])
        files = list(getattr(ctx, "files", []) or [])
        # Read the page metadata once; the same snapshot drives both tag
        # injection and SegmentIndex construction below.
        page_metadata: list[PageMetadata] = list(
            getattr(ctx, "page_metadata", []) or []
        )

        # The client gets its own shallow copy so that anything it does to
        # the list cannot affect the snapshot used after the call.
        ocr_result = await self._client.ocr(
            pages, files=files, page_metadata=list(page_metadata)
        )

        # Inject page tags around each OCR page's content so the LLM can
        # cross-reference the visual anchor without a separate prompt hack.
        for idx, ocr_page in enumerate(ocr_result.result.pages):
            # OCR may return more pages than there are metadata entries;
            # fall back to a default PageMetadata in that case.
            meta = page_metadata[idx] if idx < len(page_metadata) else PageMetadata()
            file_idx = meta.file_index if meta.file_index is not None else 0
            open_tag = Line(
                text=f'<page file="{file_idx}" number="{ocr_page.page_no}">',
                bounding_box=[],
            )
            close_tag = Line(text="</page>", bounding_box=[])
            ocr_page.lines = [open_tag, *ocr_page.lines, close_tag]

        response_ix.ocr_result = ocr_result

        # Build SegmentIndex only when provenance is on. Segment IDs
        # deliberately skip page-tag lines (see SegmentIndex.build).
        if request_ix.options.provenance.include_provenance:
            ctx.segment_index = SegmentIndex.build(
                ocr_result=ocr_result,
                granularity="line",
                pages_metadata=page_metadata,
            )

        return response_ix


# Names exported by ``from ... import *``: only the step class.
__all__ = ["OCRStep"]