Builds the ID <-> on-page-anchor map used by both the GenAIStep (to emit the segment-tagged user message) and the provenance mapper (to resolve LLM-cited IDs back to bbox/text/file_index).

Design notes:

- `build()` is a classmethod so the pipeline constructs the index in one place (OCRStep) and passes the constructed instance along in the internal context. No mutable global state; tests build indexes inline from fake OCR fixtures.
- Per-page metadata (file_index) arrives via a parallel `list[PageMetadata]` rather than being smuggled into OCRResult. Keeps segmentation decoupled from ingestion — the OCR engine legitimately doesn't know which file a page came from.
- Page-tag lines (`<page …>` / `</page>`) are filtered via a regex so the LLM can never cite them as provenance. `line_idx_in_page` increments only for real lines so the IDs stay dense (p1_l0, p1_l1, ...).
- Bounding-box normalisation divides x-coords by page width, y-coords by page height. Zero dimensions (defensive) pass through unchanged.
- `to_prompt_text(context_texts=[...])` appends paperless-style texts untagged, separated from the tagged body by a blank line (spec §7.2b). Deterministic for prompt caching.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
190 lines
6.4 KiB
Python
190 lines
6.4 KiB
Python
"""Tests for the SegmentIndex — spec §9.1."""
|
|
|
|
from __future__ import annotations
|
|
|
|
from ix.contracts import Line, OCRDetails, OCRResult, Page
|
|
from ix.segmentation import PageMetadata, SegmentIndex
|
|
|
|
|
|
def _make_pages_metadata(n: int, file_index: int = 0) -> list[PageMetadata]:
|
|
"""Build ``n`` flat-list page entries carrying only file_index."""
|
|
return [PageMetadata(file_index=file_index) for _ in range(n)]
|
|
|
|
|
|
def _line(text: str, bbox: list[float]) -> Line:
    """Build an OCR ``Line`` fixture.

    Args:
        text: Text content of the line.
        bbox: Flat coordinate list passed through as ``bounding_box``.

    Returns:
        A ``Line`` constructed from the two values, unmodified.
    """
    return Line(text=text, bounding_box=bbox)
|
|
|
|
|
|
def _page(page_no: int, width: float, height: float, lines: list[Line]) -> Page:
    """Build an OCR ``Page`` fixture.

    Args:
        page_no: Page number stored on the page.
        width: Page width.
        height: Page height.
        lines: OCR lines belonging to the page.

    Returns:
        A ``Page`` constructed from the given values, unmodified.
    """
    return Page(page_no=page_no, width=width, height=height, lines=lines)
|
|
|
|
|
|
class TestBuild:
    """ID assignment and lookup behaviour of ``SegmentIndex.build``."""

    def test_ids_per_page(self) -> None:
        """Expect ``p<page>_l<n>`` IDs whose line counter restarts on each page."""
        ocr = OCRResult(
            result=OCRDetails(
                pages=[
                    _page(1, 100.0, 200.0, [_line("hello", [0, 0, 10, 0, 10, 20, 0, 20])]),
                    _page(
                        2,
                        100.0,
                        200.0,
                        [
                            _line("foo", [0, 0, 10, 0, 10, 20, 0, 20]),
                            _line("bar", [0, 30, 10, 30, 10, 50, 0, 50]),
                        ],
                    ),
                ]
            )
        )
        idx = SegmentIndex.build(
            ocr_result=ocr,
            granularity="line",
            pages_metadata=_make_pages_metadata(2),
        )

        # IDs are expected in page order, then line order within the page.
        assert idx._ordered_ids == ["p1_l0", "p2_l0", "p2_l1"]

        pos = idx.lookup_segment("p1_l0")
        assert pos is not None
        assert pos["page"] == 1
        assert pos["text"] == "hello"
        assert pos["file_index"] == 0

    def test_page_tag_lines_excluded(self) -> None:
        """``<page …>`` / ``</page>`` lines must not receive segment IDs."""
        ocr = OCRResult(
            result=OCRDetails(
                pages=[
                    _page(
                        1,
                        100.0,
                        200.0,
                        [
                            _line('<page file="0" number="1">', [0, 0, 10, 0, 10, 5, 0, 5]),
                            _line("first real line", [0, 10, 10, 10, 10, 20, 0, 20]),
                            _line("</page>", [0, 25, 10, 25, 10, 30, 0, 30]),
                        ],
                    )
                ]
            )
        )
        idx = SegmentIndex.build(
            ocr_result=ocr,
            granularity="line",
            pages_metadata=_make_pages_metadata(1),
        )

        # Only the real line is indexed, and it gets the first (dense) ID.
        assert idx._ordered_ids == ["p1_l0"]

        # Explicit None check so a missing segment fails with a clear
        # assertion instead of a TypeError on subscripting None.
        pos = idx.lookup_segment("p1_l0")
        assert pos is not None
        assert pos["text"] == "first real line"

    def test_lookup_unknown_returns_none(self) -> None:
        """Looking up an ID that was never indexed returns ``None``."""
        idx = SegmentIndex.build(
            ocr_result=OCRResult(result=OCRDetails(pages=[])),
            granularity="line",
            pages_metadata=[],
        )
        assert idx.lookup_segment("pX_l99") is None
|
|
|
|
|
|
class TestBboxNormalization:
    """Bounding-box normalisation applied while building the index."""

    def test_divides_by_page_width_and_height(self) -> None:
        """x-coordinates are scaled by page width, y-coordinates by page height."""
        # x-coords get /width, y-coords get /height.
        ocr = OCRResult(
            result=OCRDetails(
                pages=[
                    _page(
                        1,
                        width=200.0,
                        height=400.0,
                        lines=[_line("x", [50, 100, 150, 100, 150, 300, 50, 300])],
                    )
                ]
            )
        )
        idx = SegmentIndex.build(
            ocr_result=ocr,
            granularity="line",
            pages_metadata=_make_pages_metadata(1),
        )
        pos = idx.lookup_segment("p1_l0")
        assert pos is not None
        bbox = pos["bbox"]

        # Compare with a bit of float slack: the exact arithmetic used for
        # normalisation is an implementation detail, so do not rely on
        # bit-identical floats.
        expected = [0.25, 0.25, 0.75, 0.25, 0.75, 0.75, 0.25, 0.75]
        actual = list(bbox.coordinates)
        # zip() would silently truncate, so pin the length separately.
        assert len(actual) == len(expected)
        assert all(abs(got - want) <= 1e-9 for got, want in zip(actual, expected))
|
|
|
|
|
|
class TestPromptFormat:
    """Layout of the text produced by ``SegmentIndex.to_prompt_text``."""

    def test_tagged_lines_and_untagged_texts_appended(self) -> None:
        """Tagged OCR lines come first; context texts follow without tags."""
        ocr = OCRResult(
            result=OCRDetails(
                pages=[
                    _page(
                        1,
                        100.0,
                        200.0,
                        [
                            _line("line one", [0, 0, 10, 0, 10, 5, 0, 5]),
                            _line("line two", [0, 10, 10, 10, 10, 15, 0, 15]),
                        ],
                    ),
                    _page(2, 100.0, 200.0, [_line("line A", [0, 0, 10, 0, 10, 5, 0, 5])]),
                ]
            )
        )
        idx = SegmentIndex.build(
            ocr_result=ocr,
            granularity="line",
            pages_metadata=_make_pages_metadata(2),
        )
        text = idx.to_prompt_text(context_texts=["extra paperless text", "another"])

        lines = text.split("\n")
        # Tagged OCR lines first, in insertion order.
        assert lines[0] == "[p1_l0] line one"
        assert lines[1] == "[p1_l1] line two"
        assert lines[2] == "[p2_l0] line A"
        # The extra texts are appended untagged.
        assert "extra paperless text" in text
        assert "another" in text
        # Sanity: the pageless texts should appear AFTER the last tagged line.
        p2_idx = text.index("[p2_l0]")
        extra_idx = text.index("extra paperless text")
        assert extra_idx > p2_idx

    def test_prompt_text_without_extra_texts(self) -> None:
        """With no context texts the prompt is just the tagged OCR lines."""
        ocr = OCRResult(
            result=OCRDetails(
                pages=[
                    _page(1, 100.0, 200.0, [_line("only", [0, 0, 10, 0, 10, 5, 0, 5])]),
                ]
            )
        )
        idx = SegmentIndex.build(
            ocr_result=ocr,
            granularity="line",
            pages_metadata=_make_pages_metadata(1),
        )
        text = idx.to_prompt_text(context_texts=[])
        # strip() tolerates leading/trailing whitespace around the single line.
        assert text.strip() == "[p1_l0] only"
|
|
|
|
|
|
class TestFileIndexPassthrough:
    """``file_index`` in lookup results comes from the parallel metadata list."""

    def test_file_index_from_metadata(self) -> None:
        """Each page's segments report the ``file_index`` of its metadata entry."""
        pages_meta = [
            PageMetadata(file_index=0),
            PageMetadata(file_index=1),
        ]
        # Both pages carry page_no=1, yet the IDs asserted below are p1_… and
        # p2_…: this test expects IDs to follow the position in the flat page
        # list rather than the per-page page_no.
        ocr = OCRResult(
            result=OCRDetails(
                pages=[
                    _page(1, 100.0, 200.0, [_line("a", [0, 0, 10, 0, 10, 5, 0, 5])]),
                    _page(1, 100.0, 200.0, [_line("b", [0, 0, 10, 0, 10, 5, 0, 5])]),
                ]
            )
        )
        idx = SegmentIndex.build(
            ocr_result=ocr,
            granularity="line",
            pages_metadata=pages_meta,
        )

        # Explicit None checks so a missing segment fails with a clear
        # assertion instead of a TypeError on subscripting None.
        first = idx.lookup_segment("p1_l0")
        assert first is not None
        assert first["file_index"] == 0

        second = idx.lookup_segment("p2_l0")
        assert second is not None
        assert second["file_index"] == 1
|