# infoxtractor/tests/unit/test_segment_index.py

"""Tests for the SegmentIndex — spec §9.1."""

from __future__ import annotations

import math

from ix.contracts import Line, OCRDetails, OCRResult, Page
from ix.segmentation import PageMetadata, SegmentIndex


def _make_pages_metadata(n: int, file_index: int = 0) -> list[PageMetadata]:
    """Build ``n`` flat-list page entries carrying only file_index."""
    return [PageMetadata(file_index=file_index) for _ in range(n)]


def _line(text: str, bbox: list[float]) -> Line:
    """Build a ``Line`` whose ``bounding_box`` is ``bbox`` and whose text is ``text``."""
    built = Line(
        text=text,
        bounding_box=bbox,
    )
    return built


def _page(page_no: int, width: float, height: float, lines: list[Line]) -> Page:
    """Build a ``Page`` from its number, dimensions and the lines it holds."""
    built = Page(
        page_no=page_no,
        width=width,
        height=height,
        lines=lines,
    )
    return built


class TestBuild:
    """Segment-ID assignment by ``SegmentIndex.build`` and ``lookup_segment`` results."""

    def test_ids_per_page(self) -> None:
        """Line numbering restarts at ``l0`` on every page; lookups expose page/text/file_index."""
        ocr = OCRResult(
            result=OCRDetails(
                pages=[
                    _page(1, 100.0, 200.0, [_line("hello", [0, 0, 10, 0, 10, 20, 0, 20])]),
                    _page(
                        2,
                        100.0,
                        200.0,
                        [
                            _line("foo", [0, 0, 10, 0, 10, 20, 0, 20]),
                            _line("bar", [0, 30, 10, 30, 10, 50, 0, 50]),
                        ],
                    ),
                ]
            )
        )
        idx = SegmentIndex.build(
            ocr_result=ocr,
            granularity="line",
            pages_metadata=_make_pages_metadata(2),
        )

        # Reads the underscore-prefixed attribute directly to pin the ID order.
        assert idx._ordered_ids == ["p1_l0", "p2_l0", "p2_l1"]

        pos = idx.lookup_segment("p1_l0")
        assert pos is not None
        assert pos["page"] == 1
        assert pos["text"] == "hello"
        assert pos["file_index"] == 0

    def test_page_tag_lines_excluded(self) -> None:
        """``<page ...>`` / ``</page>`` marker lines receive no segment ID."""
        ocr = OCRResult(
            result=OCRDetails(
                pages=[
                    _page(
                        1,
                        100.0,
                        200.0,
                        [
                            _line('<page file="0" number="1">', [0, 0, 10, 0, 10, 5, 0, 5]),
                            _line("first real line", [0, 10, 10, 10, 10, 20, 0, 20]),
                            _line("</page>", [0, 25, 10, 25, 10, 30, 0, 30]),
                        ],
                    )
                ]
            )
        )
        idx = SegmentIndex.build(
            ocr_result=ocr,
            granularity="line",
            pages_metadata=_make_pages_metadata(1),
        )
        # Only the real text line is indexed, and it takes the first slot (l0).
        assert idx._ordered_ids == ["p1_l0"]

        pos = idx.lookup_segment("p1_l0")
        # Assert explicitly instead of silencing the Optional with a type-ignore:
        # a missing segment then fails as a clear assertion, not a TypeError.
        assert pos is not None
        assert pos["text"] == "first real line"

    def test_lookup_unknown_returns_none(self) -> None:
        """Looking up an ID that was never indexed yields ``None``."""
        idx = SegmentIndex.build(
            ocr_result=OCRResult(result=OCRDetails(pages=[])),
            granularity="line",
            pages_metadata=[],
        )
        assert idx.lookup_segment("pX_l99") is None


class TestBboxNormalization:
    """Bounding boxes returned by ``lookup_segment`` are scaled by the page dimensions."""

    def test_divides_by_page_width_and_height(self) -> None:
        """x-coordinates are divided by page width, y-coordinates by page height."""
        # x-coords get /width, y-coords get /height.
        ocr = OCRResult(
            result=OCRDetails(
                pages=[
                    _page(
                        1,
                        width=200.0,
                        height=400.0,
                        lines=[_line("x", [50, 100, 150, 100, 150, 300, 50, 300])],
                    )
                ]
            )
        )
        idx = SegmentIndex.build(
            ocr_result=ocr,
            granularity="line",
            pages_metadata=_make_pages_metadata(1),
        )
        pos = idx.lookup_segment("p1_l0")
        assert pos is not None
        coords = pos["bbox"].coordinates
        expected = [0.25, 0.25, 0.75, 0.25, 0.75, 0.75, 0.25, 0.75]

        # Compare with a bit of float slack: the result should not depend on how
        # the implementation performs the division (e.g. x / w vs. x * (1 / w)).
        assert len(coords) == len(expected)
        for position, (actual, want) in enumerate(zip(coords, expected)):
            assert math.isclose(actual, want, abs_tol=1e-9), (
                f"coordinate {position}: got {actual!r}, expected {want!r}"
            )


class TestPromptFormat:
    """Shape of the text produced by ``SegmentIndex.to_prompt_text``."""

    def test_tagged_lines_and_untagged_texts_appended(self) -> None:
        """OCR lines come first with ``[id]`` tags; context texts follow them."""
        first_page = _page(
            1,
            100.0,
            200.0,
            [
                _line("line one", [0, 0, 10, 0, 10, 5, 0, 5]),
                _line("line two", [0, 10, 10, 10, 10, 15, 0, 15]),
            ],
        )
        second_page = _page(2, 100.0, 200.0, [_line("line A", [0, 0, 10, 0, 10, 5, 0, 5])])
        document = OCRResult(result=OCRDetails(pages=[first_page, second_page]))

        index = SegmentIndex.build(
            ocr_result=document,
            granularity="line",
            pages_metadata=_make_pages_metadata(2),
        )
        rendered = index.to_prompt_text(context_texts=["extra paperless text", "another"])

        # The leading rows are the tagged OCR lines, in insertion order.
        rows = rendered.split("\n")
        expected_tagged = ["[p1_l0] line one", "[p1_l1] line two", "[p2_l0] line A"]
        for position, expected_row in enumerate(expected_tagged):
            assert rows[position] == expected_row

        # Both context texts must be present somewhere in the output.
        assert "extra paperless text" in rendered
        assert "another" in rendered

        # Sanity: the first context text sits after the last tagged line.
        last_tag_offset = rendered.index("[p2_l0]")
        extra_offset = rendered.index("extra paperless text")
        assert last_tag_offset < extra_offset

    def test_prompt_text_without_extra_texts(self) -> None:
        """With no context texts, the output is just the tagged line (modulo whitespace)."""
        only_page = _page(1, 100.0, 200.0, [_line("only", [0, 0, 10, 0, 10, 5, 0, 5])])
        document = OCRResult(result=OCRDetails(pages=[only_page]))

        index = SegmentIndex.build(
            ocr_result=document,
            granularity="line",
            pages_metadata=_make_pages_metadata(1),
        )
        rendered = index.to_prompt_text(context_texts=[])

        assert rendered.strip() == "[p1_l0] only"


class TestFileIndexPassthrough:
    """``file_index`` values from ``pages_metadata`` are surfaced on looked-up segments."""

    def test_file_index_from_metadata(self) -> None:
        """Each page's segment reports the ``file_index`` of its metadata entry."""
        pages_meta = [
            PageMetadata(file_index=0),
            PageMetadata(file_index=1),
        ]
        # NOTE(review): both pages are built with page_no=1, yet the IDs asserted
        # below are "p1_..." and "p2_..." — presumably IDs follow the position in
        # the flat page list rather than page_no; confirm against SegmentIndex.build.
        ocr = OCRResult(
            result=OCRDetails(
                pages=[
                    _page(1, 100.0, 200.0, [_line("a", [0, 0, 10, 0, 10, 5, 0, 5])]),
                    _page(1, 100.0, 200.0, [_line("b", [0, 0, 10, 0, 10, 5, 0, 5])]),
                ]
            )
        )
        idx = SegmentIndex.build(
            ocr_result=ocr,
            granularity="line",
            pages_metadata=pages_meta,
        )

        # Assert explicitly instead of silencing the Optional with a type-ignore:
        # a missing segment then fails as a clear assertion, not a TypeError.
        first = idx.lookup_segment("p1_l0")
        assert first is not None
        assert first["file_index"] == 0

        second = idx.lookup_segment("p2_l0")
        assert second is not None
        assert second["file_index"] == 1