Final pipeline step. Three mechanical transforms: 1. include_ocr_text -> concatenate non-tag line texts, pages joined with \n\n, write to ocr_result.result.text. 2. include_geometries=False (default) -> strip ocr_result.result.pages + ocr_result.meta_data. Geometries are heavy; callers opt in. 3. Delete response.context so the internal accumulator never leaks to the caller (belt-and-braces; Field(exclude=True) already does this). validate() always returns True per spec. 7 unit tests in tests/unit/test_response_handler_step.py cover all three branches + context-not-in-model_dump check. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
136 lines
4.5 KiB
Python
136 lines
4.5 KiB
Python
"""Tests for :class:`ix.pipeline.response_handler_step.ResponseHandlerStep` (spec §8)."""
|
|
|
|
from __future__ import annotations
|
|
|
|
from ix.contracts import (
|
|
Context,
|
|
Line,
|
|
OCRDetails,
|
|
OCROptions,
|
|
OCRResult,
|
|
Options,
|
|
Page,
|
|
RequestIX,
|
|
ResponseIX,
|
|
)
|
|
from ix.contracts.response import _InternalContext
|
|
from ix.pipeline.response_handler_step import ResponseHandlerStep
|
|
|
|
|
|
def _make_request(
    *,
    include_geometries: bool = False,
    include_ocr_text: bool = False,
) -> RequestIX:
    """Build a minimal ``RequestIX`` carrying the given OCR option flags.

    Both flags are keyword-only and default to ``False``. Every other field
    is a fixed placeholder: an empty ``Context`` (no files, no texts) and
    constant identifiers.
    """
    empty_context = Context(files=[], texts=[])
    ocr_options = OCROptions(
        include_geometries=include_geometries,
        include_ocr_text=include_ocr_text,
    )
    return RequestIX(
        use_case="bank_statement_header",
        ix_client_id="test",
        request_id="r-1",
        context=empty_context,
        options=Options(ocr=ocr_options),
    )
|
|
|
|
|
|
def _populated_response() -> ResponseIX:
    """Build a ``ResponseIX`` fixture with two OCR pages and adapter metadata.

    Page 1 holds two text lines wrapped between an opening and a closing
    page-tag line (the tag lines carry an empty bounding box); page 2 holds
    a single text line. ``text`` starts out as ``None`` and a fresh
    ``_InternalContext`` is attached before the response is returned.
    """
    first_page = Page(
        page_no=1,
        width=100.0,
        height=200.0,
        lines=[
            Line(text='<page file="0" number="1">', bounding_box=[]),
            Line(text="hello", bounding_box=[0, 0, 1, 0, 1, 1, 0, 1]),
            Line(text="world", bounding_box=[0, 2, 1, 2, 1, 3, 0, 3]),
            Line(text="</page>", bounding_box=[]),
        ],
    )
    second_page = Page(
        page_no=2,
        width=100.0,
        height=200.0,
        lines=[
            Line(text="p2 line", bounding_box=[0, 0, 1, 0, 1, 1, 0, 1]),
        ],
    )
    details = OCRDetails(text=None, pages=[first_page, second_page])
    response = ResponseIX(
        ocr_result=OCRResult(result=details, meta_data={"adapter": "fake"}),
    )
    # Attach an internal accumulator so the tests can observe its removal.
    response.context = _InternalContext()
    return response
|
|
|
|
|
|
class TestValidateAlwaysTrue:
    """``ResponseHandlerStep.validate`` should report ``True``."""

    async def test_validate_always_true(self) -> None:
        """A default request plus the populated fixture validates as ``True``."""
        outcome = await ResponseHandlerStep().validate(
            _make_request(),
            _populated_response(),
        )
        assert outcome is True
|
|
|
|
|
|
class TestAttachOcrText:
    """Checks for the ``include_ocr_text`` option of ``ResponseHandlerStep``."""

    async def test_include_ocr_text_concatenates_lines(self) -> None:
        """With the flag on, ``text`` is built from the non-tag page lines."""
        step = ResponseHandlerStep()
        req = _make_request(include_ocr_text=True, include_geometries=True)
        resp = _populated_response()
        resp = await step.process(req, resp)
        # Page tag lines excluded; real lines joined within page with \n,
        # pages with \n\n.
        text = resp.ocr_result.result.text
        assert text is not None
        assert "hello\nworld" in text
        assert "p2 line" in text
        # Pin the page separator: last line of page 1, blank line, page 2.
        assert "world\n\np2 line" in text
        # Neither the opening nor the closing page tag may leak into the text
        # ("<page" alone does not match "</page>").
        assert "<page" not in text
        assert "</page>" not in text

    async def test_include_ocr_text_false_leaves_text_alone(self) -> None:
        """With the flag off, a ``None`` text stays ``None`` after processing."""
        step = ResponseHandlerStep()
        req = _make_request(include_ocr_text=False, include_geometries=True)
        resp = _populated_response()
        # Explicitly reset so the precondition is visible in the test itself.
        resp.ocr_result.result.text = None
        resp = await step.process(req, resp)
        assert resp.ocr_result.result.text is None
|
|
|
|
|
|
class TestStripGeometries:
    """Checks for the ``include_geometries`` option of ``ResponseHandlerStep``."""

    async def test_strips_pages_and_meta_when_off(self) -> None:
        """With the flag off, pages and meta data come back empty."""
        handler = ResponseHandlerStep()
        request = _make_request(include_geometries=False)
        processed = await handler.process(request, _populated_response())
        assert processed.ocr_result.result.pages == []
        assert processed.ocr_result.meta_data == {}

    async def test_keeps_pages_when_on(self) -> None:
        """With the flag on, page numbers and meta data are unchanged."""
        handler = ResponseHandlerStep()
        request = _make_request(include_geometries=True)
        response = _populated_response()
        # Snapshot the page numbers before the step runs.
        expected_page_numbers = [
            page.page_no for page in response.ocr_result.result.pages
        ]
        processed = await handler.process(request, response)
        actual_page_numbers = [
            page.page_no for page in processed.ocr_result.result.pages
        ]
        assert actual_page_numbers == expected_page_numbers
        assert processed.ocr_result.meta_data == {"adapter": "fake"}
|
|
|
|
|
|
class TestContextDeletion:
    """Checks that the internal context is absent after processing."""

    async def test_context_removed(self) -> None:
        """``context`` is ``None`` on the response returned by ``process``."""
        handler = ResponseHandlerStep()
        request = _make_request()
        processed = await handler.process(request, _populated_response())
        assert processed.context is None

    async def test_context_not_in_model_dump(self) -> None:
        """The dumped response has no ``"context"`` key."""
        handler = ResponseHandlerStep()
        request = _make_request()
        processed = await handler.process(request, _populated_response())
        dumped = processed.model_dump()
        assert "context" not in dumped
|