"""OCRStep — run OCR, inject page tags, build the SegmentIndex (spec §6.2). Runs after :class:`~ix.pipeline.setup_step.SetupStep`. Three things happen: 1. Dispatch the flat page list to the injected :class:`OCRClient` and write the raw :class:`~ix.contracts.OCRResult` onto the response. 2. Inject ```` / ```` tag lines around each page's content so the GenAIStep can ground citations visually (spec §6.2c). 3. When provenance is on, build a :class:`SegmentIndex` over the *non-tag* lines and stash it on the internal context. Validation follows the triad-or-use_ocr rule from the spec: if any of ``include_geometries`` / ``include_ocr_text`` / ``ocr_only`` is set but ``context.files`` is empty, raise ``IX_000_004``. If OCR isn't wanted (text-only request), return ``False`` to skip the step silently. """ from __future__ import annotations from ix.contracts import Line, RequestIX, ResponseIX from ix.errors import IXErrorCode, IXException from ix.ocr.client import OCRClient from ix.pipeline.step import Step from ix.segmentation import PageMetadata, SegmentIndex class OCRStep(Step): """Inject-and-index OCR stage.""" def __init__(self, ocr_client: OCRClient) -> None: self._client = ocr_client async def validate(self, request_ix: RequestIX, response_ix: ResponseIX) -> bool: opts = request_ix.options.ocr ctx = response_ix.context files = list(getattr(ctx, "files", [])) if ctx is not None else [] ocr_artifacts_requested = ( opts.include_geometries or opts.include_ocr_text or opts.ocr_only ) if ocr_artifacts_requested and not files: raise IXException(IXErrorCode.IX_000_004) if not files: return False # OCR runs if use_ocr OR any of the artifact flags is set. return bool(opts.use_ocr or ocr_artifacts_requested) async def process( self, request_ix: RequestIX, response_ix: ResponseIX ) -> ResponseIX: ctx = response_ix.context assert ctx is not None, "SetupStep must populate response_ix.context" pages = list(getattr(ctx, "pages", [])) files = list(getattr(ctx, "files", []) or []) page_metadata = list(getattr(ctx, "page_metadata", []) or []) ocr_result = await self._client.ocr( pages, files=files, page_metadata=page_metadata ) # Inject page tags around each OCR page's content so the LLM can # cross-reference the visual anchor without a separate prompt hack. page_metadata: list[PageMetadata] = list( getattr(ctx, "page_metadata", []) or [] ) for idx, ocr_page in enumerate(ocr_result.result.pages): meta = page_metadata[idx] if idx < len(page_metadata) else PageMetadata() file_idx = meta.file_index if meta.file_index is not None else 0 open_tag = Line( text=f'', bounding_box=[], ) close_tag = Line(text="", bounding_box=[]) ocr_page.lines = [open_tag, *ocr_page.lines, close_tag] response_ix.ocr_result = ocr_result # Build SegmentIndex only when provenance is on. Segment IDs # deliberately skip page-tag lines (see SegmentIndex.build). if request_ix.options.provenance.include_provenance: seg_idx = SegmentIndex.build( ocr_result=ocr_result, granularity="line", pages_metadata=page_metadata, ) ctx.segment_index = seg_idx return response_ix __all__ = ["OCRStep"]