Final pipeline step. Three mechanical transforms: 1. include_ocr_text -> concatenate non-tag line texts, pages joined with \n\n, write to ocr_result.result.text. 2. include_geometries=False (default) -> strip ocr_result.result.pages + ocr_result.meta_data. Geometries are heavy; callers opt in. 3. Delete response.context so the internal accumulator never leaks to the caller (belt-and-braces; Field(exclude=True) already does this). validate() always returns True per spec. 7 unit tests in tests/unit/test_response_handler_step.py cover all three branches + context-not-in-model_dump check. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
67 lines
2.1 KiB
Python
67 lines
2.1 KiB
Python
"""ResponseHandlerStep — final shape-up before the caller sees the payload (spec §8).
|
|
|
|
Does three purely mechanical things:
|
|
|
|
1. When ``include_ocr_text`` is set, concatenate every non-tag line text
|
|
into ``ocr_result.result.text`` (pages joined with blank line).
|
|
2. When ``include_geometries`` is **not** set (the default), strip
|
|
``ocr_result.result.pages`` and ``ocr_result.meta_data`` — geometries
|
|
are heavyweight; callers opt in.
|
|
3. Clear ``response_ix.context`` (belt-and-braces — ``Field(exclude=True)``
|
|
already keeps it out of ``model_dump`` output).
|
|
|
|
:meth:`validate` always returns True per spec.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
|
|
from ix.contracts import RequestIX, ResponseIX
|
|
from ix.pipeline.step import Step
|
|
|
|
# Recognises the start of an opening or closing page tag ("<page", "</page"),
# case-insensitively and tolerating whitespace around "<" and "/".
_PAGE_TAG_RE = re.compile(r"^\s*<\s*/?\s*page\b", re.IGNORECASE)


def _is_page_tag(text: str | None) -> bool:
    """Return True when *text* starts with a page tag.

    ``None`` and the empty string are never considered tags.
    """
    if text:
        return _PAGE_TAG_RE.match(text) is not None
    return False
|
|
|
|
|
|
class ResponseHandlerStep(Step):
    """Last pipeline step: trims the response to what the caller asked for."""

    async def validate(self, request_ix: RequestIX, response_ix: ResponseIX) -> bool:
        """Return True unconditionally; neither argument is inspected."""
        return True

    async def process(
        self, request_ix: RequestIX, response_ix: ResponseIX
    ) -> ResponseIX:
        """Apply the request's OCR output options to *response_ix* and return it.

        In order:

        1. If ``include_ocr_text`` is set, store the flattened text in
           ``ocr_result.result.text``: non-page-tag lines of a page are joined
           with a newline, pages are joined with a blank line, and an empty
           result is stored as ``None``.
        2. Unless ``include_geometries`` is set, replace
           ``ocr_result.result.pages`` with ``[]`` and ``ocr_result.meta_data``
           with ``{}``.
        3. Set ``response_ix.context`` to ``None``.

        The same ``response_ix`` object is mutated in place and returned.
        """
        options = request_ix.options.ocr

        # Flat text must be built first: it reads the pages that the
        # geometry-stripping branch below empties.
        if options.include_ocr_text:
            result = response_ix.ocr_result.result
            flattened = "\n\n".join(
                "\n".join(
                    entry.text or ""
                    for entry in sheet.lines
                    if not _is_page_tag(entry.text)
                )
                for sheet in result.pages
            )
            result.text = flattened if flattened else None

        # Geometries are only kept when the caller explicitly opts in.
        if not options.include_geometries:
            ocr_result = response_ix.ocr_result
            ocr_result.result.pages = []
            ocr_result.meta_data = {}

        # Defense in depth: the context is already Field(exclude=True).
        response_ix.context = None

        return response_ix
|
|
|
|
|
|
# Names exported by ``from <module> import *``: only the step class.
__all__ = ["ResponseHandlerStep"]
|