"""Tests for DocumentIngestor.build_pages (spec ยง6.1).""" from __future__ import annotations from io import BytesIO from pathlib import Path import pytest from ix.errors import IXErrorCode, IXException from ix.ingestion import DocumentIngestor def _make_pdf_bytes(n_pages: int) -> bytes: import fitz doc = fitz.open() for i in range(n_pages): page = doc.new_page(width=200, height=300) page.insert_text((10, 20), f"page {i+1}") out = doc.tobytes() doc.close() return out def _make_multi_frame_tiff_bytes(n_frames: int) -> bytes: from PIL import Image frames = [Image.new("L", (10, 10), color=i * 30) for i in range(n_frames)] buf = BytesIO() frames[0].save(buf, format="TIFF", save_all=True, append_images=frames[1:]) return buf.getvalue() class TestPdf: def test_three_page_pdf_yields_three_pages(self, tmp_path: Path) -> None: p = tmp_path / "doc.pdf" p.write_bytes(_make_pdf_bytes(3)) ing = DocumentIngestor() pages, metas = ing.build_pages(files=[(p, "application/pdf")], texts=[]) assert len(pages) == 3 for i, page in enumerate(pages, start=1): assert page.page_no == i assert page.width > 0 assert page.height > 0 assert len(metas) == 3 for m in metas: assert m.file_index == 0 def test_page_count_cap_raises_IX_000_006(self, tmp_path: Path) -> None: p = tmp_path / "toomany.pdf" p.write_bytes(_make_pdf_bytes(101)) ing = DocumentIngestor() with pytest.raises(IXException) as ei: ing.build_pages(files=[(p, "application/pdf")], texts=[]) assert ei.value.code is IXErrorCode.IX_000_006 class TestImages: def test_single_frame_png(self, tmp_path: Path) -> None: from PIL import Image p = tmp_path / "img.png" Image.new("RGB", (50, 80), color="white").save(p, format="PNG") ing = DocumentIngestor() pages, metas = ing.build_pages(files=[(p, "image/png")], texts=[]) assert len(pages) == 1 assert pages[0].width == 50 assert pages[0].height == 80 assert metas[0].file_index == 0 def test_multi_frame_tiff_yields_multiple_pages(self, tmp_path: Path) -> None: p = tmp_path / "multi.tif" p.write_bytes(_make_multi_frame_tiff_bytes(2)) ing = DocumentIngestor() pages, metas = ing.build_pages(files=[(p, "image/tiff")], texts=[]) assert len(pages) == 2 for page in pages: assert page.width == 10 assert page.height == 10 # Both frames share the same file_index. assert {m.file_index for m in metas} == {0} class TestTexts: def test_texts_become_pages(self) -> None: ing = DocumentIngestor() pages, metas = ing.build_pages(files=[], texts=["hello", "world"]) assert len(pages) == 2 assert pages[0].page_no == 1 assert pages[1].page_no == 2 # Text-backed pages have no file_index source. assert metas[0].file_index is None assert metas[1].file_index is None class TestFileIndexes: def test_multi_file_indexes_are_contiguous(self, tmp_path: Path) -> None: p1 = tmp_path / "a.pdf" p1.write_bytes(_make_pdf_bytes(2)) p2 = tmp_path / "b.pdf" p2.write_bytes(_make_pdf_bytes(1)) ing = DocumentIngestor() pages, metas = ing.build_pages( files=[(p1, "application/pdf"), (p2, "application/pdf")], texts=[], ) assert len(pages) == 3 # First two pages from file 0, last from file 1. assert metas[0].file_index == 0 assert metas[1].file_index == 0 assert metas[2].file_index == 1