"""ResponseHandlerStep — final shape-up before the caller sees the payload (spec §8). Does three purely mechanical things: 1. When ``include_ocr_text`` is set, concatenate every non-tag line text into ``ocr_result.result.text`` (pages joined with blank line). 2. When ``include_geometries`` is **not** set (the default), strip ``ocr_result.result.pages`` and ``ocr_result.meta_data`` — geometries are heavyweight; callers opt in. 3. Clear ``response_ix.context`` (belt-and-braces — ``Field(exclude=True)`` already keeps it out of ``model_dump`` output). :meth:`validate` always returns True per spec. """ from __future__ import annotations import re from ix.contracts import RequestIX, ResponseIX from ix.pipeline.step import Step _PAGE_TAG_RE = re.compile(r"^\s*<\s*/?\s*page\b", re.IGNORECASE) def _is_page_tag(text: str | None) -> bool: if not text: return False return bool(_PAGE_TAG_RE.match(text)) class ResponseHandlerStep(Step): """Final shape-up step.""" async def validate(self, request_ix: RequestIX, response_ix: ResponseIX) -> bool: return True async def process( self, request_ix: RequestIX, response_ix: ResponseIX ) -> ResponseIX: ocr_opts = request_ix.options.ocr # 1. Attach flat OCR text if requested. if ocr_opts.include_ocr_text: page_texts: list[str] = [] for page in response_ix.ocr_result.result.pages: line_texts = [ line.text or "" for line in page.lines if not _is_page_tag(line.text) ] page_texts.append("\n".join(line_texts)) response_ix.ocr_result.result.text = "\n\n".join(page_texts) or None # 2. Strip geometries unless explicitly retained. if not ocr_opts.include_geometries: response_ix.ocr_result.result.pages = [] response_ix.ocr_result.meta_data = {} # 3. Drop the internal context — already Field(exclude=True), # this is defense in depth. response_ix.context = None return response_ix __all__ = ["ResponseHandlerStep"]