"""Tests for the SegmentIndex — spec §9.1.""" from __future__ import annotations from ix.contracts import Line, OCRDetails, OCRResult, Page from ix.segmentation import PageMetadata, SegmentIndex def _make_pages_metadata(n: int, file_index: int = 0) -> list[PageMetadata]: """Build ``n`` flat-list page entries carrying only file_index.""" return [PageMetadata(file_index=file_index) for _ in range(n)] def _line(text: str, bbox: list[float]) -> Line: return Line(text=text, bounding_box=bbox) def _page(page_no: int, width: float, height: float, lines: list[Line]) -> Page: return Page(page_no=page_no, width=width, height=height, lines=lines) class TestBuild: def test_ids_per_page(self) -> None: ocr = OCRResult( result=OCRDetails( pages=[ _page(1, 100.0, 200.0, [_line("hello", [0, 0, 10, 0, 10, 20, 0, 20])]), _page( 2, 100.0, 200.0, [ _line("foo", [0, 0, 10, 0, 10, 20, 0, 20]), _line("bar", [0, 30, 10, 30, 10, 50, 0, 50]), ], ), ] ) ) idx = SegmentIndex.build( ocr_result=ocr, granularity="line", pages_metadata=_make_pages_metadata(2), ) assert idx._ordered_ids == ["p1_l0", "p2_l0", "p2_l1"] pos = idx.lookup_segment("p1_l0") assert pos is not None assert pos["page"] == 1 assert pos["text"] == "hello" assert pos["file_index"] == 0 def test_page_tag_lines_excluded(self) -> None: ocr = OCRResult( result=OCRDetails( pages=[ _page( 1, 100.0, 200.0, [ _line('', [0, 0, 10, 0, 10, 5, 0, 5]), _line("first real line", [0, 10, 10, 10, 10, 20, 0, 20]), _line("", [0, 25, 10, 25, 10, 30, 0, 30]), ], ) ] ) ) idx = SegmentIndex.build( ocr_result=ocr, granularity="line", pages_metadata=_make_pages_metadata(1), ) assert idx._ordered_ids == ["p1_l0"] assert idx.lookup_segment("p1_l0")["text"] == "first real line" # type: ignore[index] def test_lookup_unknown_returns_none(self) -> None: idx = SegmentIndex.build( ocr_result=OCRResult(result=OCRDetails(pages=[])), granularity="line", pages_metadata=[], ) assert idx.lookup_segment("pX_l99") is None class TestBboxNormalization: def test_divides_by_page_width_and_height(self) -> None: # x-coords get /width, y-coords get /height. ocr = OCRResult( result=OCRDetails( pages=[ _page( 1, width=200.0, height=400.0, lines=[_line("x", [50, 100, 150, 100, 150, 300, 50, 300])], ) ] ) ) idx = SegmentIndex.build( ocr_result=ocr, granularity="line", pages_metadata=_make_pages_metadata(1), ) pos = idx.lookup_segment("p1_l0") assert pos is not None bbox = pos["bbox"] # Compare with a bit of float slack. assert bbox.coordinates == [0.25, 0.25, 0.75, 0.25, 0.75, 0.75, 0.25, 0.75] class TestPromptFormat: def test_tagged_lines_and_untagged_texts_appended(self) -> None: ocr = OCRResult( result=OCRDetails( pages=[ _page( 1, 100.0, 200.0, [ _line("line one", [0, 0, 10, 0, 10, 5, 0, 5]), _line("line two", [0, 10, 10, 10, 10, 15, 0, 15]), ], ), _page(2, 100.0, 200.0, [_line("line A", [0, 0, 10, 0, 10, 5, 0, 5])]), ] ) ) idx = SegmentIndex.build( ocr_result=ocr, granularity="line", pages_metadata=_make_pages_metadata(2), ) text = idx.to_prompt_text(context_texts=["extra paperless text", "another"]) lines = text.split("\n") # Tagged OCR lines first, in insertion order. assert lines[0] == "[p1_l0] line one" assert lines[1] == "[p1_l1] line two" assert lines[2] == "[p2_l0] line A" # The extra texts are appended untagged. assert "extra paperless text" in text assert "another" in text # Sanity: the pageless texts should appear AFTER the last tagged line. p2_idx = text.index("[p2_l0]") extra_idx = text.index("extra paperless text") assert extra_idx > p2_idx def test_prompt_text_without_extra_texts(self) -> None: ocr = OCRResult( result=OCRDetails( pages=[ _page(1, 100.0, 200.0, [_line("only", [0, 0, 10, 0, 10, 5, 0, 5])]), ] ) ) idx = SegmentIndex.build( ocr_result=ocr, granularity="line", pages_metadata=_make_pages_metadata(1), ) text = idx.to_prompt_text(context_texts=[]) assert text.strip() == "[p1_l0] only" class TestFileIndexPassthrough: def test_file_index_from_metadata(self) -> None: pages_meta = [ PageMetadata(file_index=0), PageMetadata(file_index=1), ] ocr = OCRResult( result=OCRDetails( pages=[ _page(1, 100.0, 200.0, [_line("a", [0, 0, 10, 0, 10, 5, 0, 5])]), _page(1, 100.0, 200.0, [_line("b", [0, 0, 10, 0, 10, 5, 0, 5])]), ] ) ) idx = SegmentIndex.build( ocr_result=ocr, granularity="line", pages_metadata=pages_meta, ) assert idx.lookup_segment("p1_l0")["file_index"] == 0 # type: ignore[index] assert idx.lookup_segment("p2_l0")["file_index"] == 1 # type: ignore[index]