Runs Surya's detection + recognition over PIL images rendered from each Page's source file (PDFs via PyMuPDF, images via Pillow). Lazy warm_up so FastAPI lifespan start stays predictable. Deferred Surya/torch imports keep the base install slim — the heavy deps stay under [ocr]. Extends OCRClient Protocol with optional files + page_metadata kwargs so the engine can resolve each page back to its on-disk source; Fake accepts-and-ignores to keep hermetic tests unchanged. selfcheck() runs the predictors on a 1x1 PIL image — wired into /healthz by Task 4.3. Tests: 6 hermetic unit tests (Surya predictors mocked, no model download); 2 live tests gated on IX_TEST_OLLAMA=1 (never run in CI). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
95 lines
3.6 KiB
Python
95 lines
3.6 KiB
Python
"""OCRStep — run OCR, inject page tags, build the SegmentIndex (spec §6.2).
|
|
|
|
Runs after :class:`~ix.pipeline.setup_step.SetupStep`. Three things
|
|
happen:
|
|
|
|
1. Dispatch the flat page list to the injected :class:`OCRClient` and
|
|
write the raw :class:`~ix.contracts.OCRResult` onto the response.
|
|
2. Inject ``<page file="..." number="...">`` / ``</page>`` tag lines
|
|
around each page's content so the GenAIStep can ground citations
|
|
visually (spec §6.2c).
|
|
3. When provenance is on, build a :class:`SegmentIndex` over the
|
|
*non-tag* lines and stash it on the internal context.
|
|
|
|
Validation follows the triad-or-use_ocr rule from the spec: if any of
|
|
``include_geometries`` / ``include_ocr_text`` / ``ocr_only`` is set but
|
|
``context.files`` is empty, raise ``IX_000_004``. If OCR isn't wanted
|
|
(text-only request), return ``False`` to skip the step silently.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from ix.contracts import Line, RequestIX, ResponseIX
|
|
from ix.errors import IXErrorCode, IXException
|
|
from ix.ocr.client import OCRClient
|
|
from ix.pipeline.step import Step
|
|
from ix.segmentation import PageMetadata, SegmentIndex
|
|
|
|
|
|
class OCRStep(Step):
    """Inject-and-index OCR stage.

    Sends the context's pages to the injected OCR client, wraps every
    returned page's lines in ``<page ...>`` / ``</page>`` tag lines, stores
    the result on the response and, when provenance is requested, builds a
    :class:`SegmentIndex` and stashes it on the context.
    """

    def __init__(self, ocr_client: OCRClient) -> None:
        """Keep a reference to the OCR client that :meth:`process` awaits."""
        self._client = ocr_client

    async def validate(self, request_ix: RequestIX, response_ix: ResponseIX) -> bool:
        """Decide whether the OCR step should run for this request.

        Returns:
            ``True`` when there are files and either ``use_ocr`` or one of
            ``include_geometries`` / ``include_ocr_text`` / ``ocr_only`` is
            set; ``False`` when there are no files and no OCR artifact was
            requested, or when files exist but no OCR flag is set.

        Raises:
            IXException: with ``IX_000_004`` when an OCR artifact flag is
                set but the context carries no files.
        """
        opts = request_ix.options.ocr
        ctx = response_ix.context
        # ``or []`` mirrors process(): a ``files`` attribute that is None is
        # treated as "no files" instead of blowing up in ``list(None)``.
        files = list(getattr(ctx, "files", []) or []) if ctx is not None else []

        ocr_artifacts_requested = (
            opts.include_geometries or opts.include_ocr_text or opts.ocr_only
        )
        if ocr_artifacts_requested and not files:
            raise IXException(IXErrorCode.IX_000_004)

        if not files:
            return False

        # OCR runs if use_ocr OR any of the artifact flags is set.
        return bool(opts.use_ocr or ocr_artifacts_requested)

    async def process(
        self, request_ix: RequestIX, response_ix: ResponseIX
    ) -> ResponseIX:
        """Run OCR, inject page tags and optionally build the segment index.

        Returns:
            The same ``response_ix``, with ``ocr_result`` set and, when
            ``include_provenance`` is on, ``context.segment_index`` set.
        """
        ctx = response_ix.context
        assert ctx is not None, "SetupStep must populate response_ix.context"

        # Missing or None attributes all degrade to empty lists.
        pages = list(getattr(ctx, "pages", []) or [])
        files = list(getattr(ctx, "files", []) or [])
        # Read once; the same list feeds the client call, the tag injection
        # below and SegmentIndex.build.
        page_metadata: list[PageMetadata] = list(
            getattr(ctx, "page_metadata", []) or []
        )
        ocr_result = await self._client.ocr(
            pages, files=files, page_metadata=page_metadata
        )

        # Inject page tags around each OCR page's content so the LLM can
        # cross-reference the visual anchor without a separate prompt hack.
        for idx, ocr_page in enumerate(ocr_result.result.pages):
            # Pages without a metadata entry fall back to a default
            # PageMetadata; a missing file_index is rendered as file 0.
            meta = page_metadata[idx] if idx < len(page_metadata) else PageMetadata()
            file_idx = meta.file_index if meta.file_index is not None else 0
            open_tag = Line(
                text=f'<page file="{file_idx}" number="{ocr_page.page_no}">',
                bounding_box=[],
            )
            close_tag = Line(text="</page>", bounding_box=[])
            ocr_page.lines = [open_tag, *ocr_page.lines, close_tag]

        response_ix.ocr_result = ocr_result

        # Build SegmentIndex only when provenance is on. Segment IDs
        # deliberately skip page-tag lines (see SegmentIndex.build).
        if request_ix.options.provenance.include_provenance:
            ctx.segment_index = SegmentIndex.build(
                ocr_result=ocr_result,
                granularity="line",
                pages_metadata=page_metadata,
            )

        return response_ix
|
|
|
|
|
|
# Explicit public API: OCRStep is the only name exported by this module.
__all__ = ["OCRStep"]
|