Three layered modules the SetupStep will wire together in Task 2.4.

- fetch.py: async httpx fetch with configurable timeouts + incremental size cap (stream=True, accumulate bytes, raise IX_000_007 when exceeded). file:// URLs read locally. Auth headers pass through. The caller injects a FetchConfig — env reads happen in ix.config (Chunk 3).
- mime.py: python-magic byte-sniff + SUPPORTED_MIMES frozenset + require_supported(mime) helper that raises IX_000_005.
- pages.py: DocumentIngestor.build_pages(files, texts) -> (list[Page], list[PageMetadata]). PDFs via PyMuPDF (hard 100 pg/PDF cap -> IX_000_006), images via Pillow (multi-frame TIFFs yield multiple Pages), texts as zero-dim Pages so GenAIStep can still cite them.

21 new unit tests (141 total) cover: fetch success with headers, 4xx/5xx mapping, timeout -> IX_000_007, size cap enforced globally + per-file, file:// happy path + missing file, MIME detection for PDF/PNG/JPEG/TIFF, require_supported gate, PDF/TIFF/text page counts, 101-page PDF -> IX_000_006, multi-file file_index assignment.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
116 lines
3.7 KiB
Python
116 lines
3.7 KiB
Python
"""Tests for DocumentIngestor.build_pages (spec §6.1)."""
|
|
|
|
from __future__ import annotations
|
|
|
|
from io import BytesIO
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
|
|
from ix.errors import IXErrorCode, IXException
|
|
from ix.ingestion import DocumentIngestor
|
|
|
|
|
|
def _make_pdf_bytes(n_pages: int) -> bytes:
    """Build an in-memory PDF with ``n_pages`` pages and return its bytes.

    Every page is created with ``width=200, height=300`` and gets the text
    ``page <k>`` (1-based) inserted at position ``(10, 20)``.

    Args:
        n_pages: Number of pages to create.

    Returns:
        The serialized PDF document.
    """
    import fitz

    doc = fitz.open()
    try:
        for i in range(n_pages):
            page = doc.new_page(width=200, height=300)
            page.insert_text((10, 20), f"page {i+1}")
        return doc.tobytes()
    finally:
        # Close the document even when page creation or serialization
        # raises, so a failing fixture does not leak the open document.
        doc.close()
|
|
|
|
|
|
def _make_multi_frame_tiff_bytes(n_frames: int) -> bytes:
    """Return the bytes of a TIFF file containing ``n_frames`` frames.

    Each frame is a 10x10 image in mode ``"L"``; frame ``k`` (0-based) is
    filled with the color value ``k * 30``.
    """
    from PIL import Image

    images = []
    for idx in range(n_frames):
        images.append(Image.new("L", (10, 10), color=idx * 30))

    # The first image drives the save; the remaining ones are appended to it.
    head = images[0]
    tail = images[1:]

    sink = BytesIO()
    head.save(sink, format="TIFF", save_all=True, append_images=tail)
    return sink.getvalue()
|
|
|
|
|
|
class TestPdf:
    """PDF inputs: one Page per PDF page, and the page-count cap."""

    def test_three_page_pdf_yields_three_pages(self, tmp_path: Path) -> None:
        """A 3-page PDF produces three numbered, non-empty pages from file 0."""
        pdf_path = tmp_path / "doc.pdf"
        pdf_path.write_bytes(_make_pdf_bytes(3))

        pages, metas = DocumentIngestor().build_pages(
            files=[(pdf_path, "application/pdf")],
            texts=[],
        )

        # Page numbers are 1-based and sequential.
        assert [pg.page_no for pg in pages] == [1, 2, 3]
        for pg in pages:
            assert pg.width > 0
            assert pg.height > 0

        # Every page's metadata refers to the single input file.
        assert [meta.file_index for meta in metas] == [0, 0, 0]

    def test_page_count_cap_raises_IX_000_006(self, tmp_path: Path) -> None:
        """A 101-page PDF is rejected with error code IX_000_006."""
        pdf_path = tmp_path / "toomany.pdf"
        pdf_path.write_bytes(_make_pdf_bytes(101))
        ingestor = DocumentIngestor()

        with pytest.raises(IXException) as exc_info:
            ingestor.build_pages(files=[(pdf_path, "application/pdf")], texts=[])

        assert exc_info.value.code is IXErrorCode.IX_000_006
|
|
|
|
|
|
class TestImages:
    """Image inputs: single-frame PNG and multi-frame TIFF."""

    def test_single_frame_png(self, tmp_path: Path) -> None:
        """A 50x80 PNG becomes exactly one page with matching dimensions."""
        from PIL import Image

        png_path = tmp_path / "img.png"
        white = Image.new("RGB", (50, 80), color="white")
        white.save(png_path, format="PNG")

        pages, metas = DocumentIngestor().build_pages(
            files=[(png_path, "image/png")],
            texts=[],
        )

        assert len(pages) == 1
        only_page = pages[0]
        assert only_page.width == 50
        assert only_page.height == 80
        assert metas[0].file_index == 0

    def test_multi_frame_tiff_yields_multiple_pages(self, tmp_path: Path) -> None:
        """A two-frame TIFF becomes two 10x10 pages from the same file."""
        tiff_path = tmp_path / "multi.tif"
        tiff_path.write_bytes(_make_multi_frame_tiff_bytes(2))

        pages, metas = DocumentIngestor().build_pages(
            files=[(tiff_path, "image/tiff")],
            texts=[],
        )

        assert len(pages) == 2
        for frame_page in pages:
            assert frame_page.width == 10
            assert frame_page.height == 10

        # All frames originate from the one input file, so only index 0 appears.
        seen_indexes = {meta.file_index for meta in metas}
        assert seen_indexes == {0}
|
|
|
|
|
|
class TestTexts:
    """Plain-text inputs are turned into pages of their own."""

    def test_texts_become_pages(self) -> None:
        """Two texts yield two sequentially numbered pages with no file_index."""
        pages, metas = DocumentIngestor().build_pages(
            files=[],
            texts=["hello", "world"],
        )

        assert [pg.page_no for pg in pages] == [1, 2]

        # Pages built from text did not come from a file, so file_index is None.
        first_meta, second_meta = metas[0], metas[1]
        assert first_meta.file_index is None
        assert second_meta.file_index is None
|
|
|
|
|
|
class TestFileIndexes:
    """file_index assignment when several files are ingested together."""

    def test_multi_file_indexes_are_contiguous(self, tmp_path: Path) -> None:
        """Pages of a 2-page PDF and a 1-page PDF get file_index 0, 0, 1."""
        first_pdf = tmp_path / "a.pdf"
        second_pdf = tmp_path / "b.pdf"
        first_pdf.write_bytes(_make_pdf_bytes(2))
        second_pdf.write_bytes(_make_pdf_bytes(1))

        inputs = [(first_pdf, "application/pdf"), (second_pdf, "application/pdf")]
        pages, metas = DocumentIngestor().build_pages(files=inputs, texts=[])

        assert len(pages) == 3
        # The two pages of the first file come first, then the second file's page.
        assert [meta.file_index for meta in metas[:3]] == [0, 0, 1]
|