infoxtractor/tests/unit/test_ingestion_pages.py
Dirk Riemann 290e51416f
All checks were successful
tests / test (push) Successful in 57s
tests / test (pull_request) Successful in 1m12s
feat(ingestion): fetch_file + MIME sniff + DocumentIngestor (spec §6.1)
Three layered modules the SetupStep will wire together in Task 2.4.

- fetch.py: async httpx fetch with configurable timeouts + incremental
  size cap (stream=True, accumulate bytes, raise IX_000_007 when
  exceeded). file:// URLs read locally. Auth headers pass through. The
  caller injects a FetchConfig — env reads happen in ix.config (Chunk 3).
- mime.py: python-magic byte-sniff + SUPPORTED_MIMES frozenset +
  require_supported(mime) helper that raises IX_000_005.
- pages.py: DocumentIngestor.build_pages(files, texts) ->
  (list[Page], list[PageMetadata]). PDFs are handled via PyMuPDF (hard
  cap of 100 pages per PDF; exceeding it raises IX_000_006), images via
  Pillow (multi-frame TIFFs yield multiple Pages), and texts become
  zero-dimension Pages so GenAIStep can still cite them.

21 new unit tests (141 total) cover: fetch success with headers, 4xx/5xx
mapping, timeout -> IX_000_007, size cap enforced globally + per-file,
file:// happy path + missing file, MIME detection for PDF/PNG/JPEG/TIFF,
require_supported gate, PDF/TIFF/text page counts, 101-page PDF ->
IX_000_006, multi-file file_index assignment.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-18 11:12:00 +02:00

116 lines
3.7 KiB
Python

"""Tests for DocumentIngestor.build_pages (spec §6.1)."""
from __future__ import annotations
from io import BytesIO
from pathlib import Path
import pytest
from ix.errors import IXErrorCode, IXException
from ix.ingestion import DocumentIngestor
def _make_pdf_bytes(n_pages: int) -> bytes:
    """Build an in-memory PDF with ``n_pages`` pages and return its bytes.

    Each page is 200x300 and carries the text ``"page <k>"`` (1-based) so
    pages are distinguishable. PyMuPDF is imported lazily so importing this
    test module does not require it.
    """
    import fitz

    # The context manager closes the document even if page creation or
    # serialisation raises, so a failing fixture does not leak the handle.
    with fitz.open() as doc:
        for i in range(n_pages):
            page = doc.new_page(width=200, height=300)
            page.insert_text((10, 20), f"page {i+1}")
        return doc.tobytes()
def _make_multi_frame_tiff_bytes(n_frames: int) -> bytes:
    """Return the bytes of a TIFF holding ``n_frames`` 10x10 greyscale frames.

    Frame ``k`` (0-based) is filled with grey level ``k * 30`` so the frames
    differ from one another. Pillow is imported lazily.
    """
    from PIL import Image

    images = []
    for idx in range(n_frames):
        images.append(Image.new("L", (10, 10), color=idx * 30))

    sink = BytesIO()
    # The first frame drives the save; the remaining ones are appended to it.
    images[0].save(
        sink,
        format="TIFF",
        save_all=True,
        append_images=images[1:],
    )
    return sink.getvalue()
class TestPdf:
    """PDF inputs handed to ``DocumentIngestor.build_pages``."""

    def test_three_page_pdf_yields_three_pages(self, tmp_path: Path) -> None:
        """A 3-page PDF gives 3 numbered pages, all attributed to file 0."""
        pdf_path = tmp_path / "doc.pdf"
        pdf_path.write_bytes(_make_pdf_bytes(3))

        ingestor = DocumentIngestor()
        pages, metas = ingestor.build_pages(
            files=[(pdf_path, "application/pdf")],
            texts=[],
        )

        assert len(pages) == 3
        for expected_no, pg in enumerate(pages, start=1):
            assert pg.page_no == expected_no
            assert pg.width > 0
            assert pg.height > 0

        assert len(metas) == 3
        for meta in metas:
            assert meta.file_index == 0

    def test_page_count_cap_raises_IX_000_006(self, tmp_path: Path) -> None:
        """A 101-page PDF is rejected with error code IX_000_006."""
        pdf_path = tmp_path / "toomany.pdf"
        pdf_path.write_bytes(_make_pdf_bytes(101))

        ingestor = DocumentIngestor()
        with pytest.raises(IXException) as excinfo:
            ingestor.build_pages(
                files=[(pdf_path, "application/pdf")],
                texts=[],
            )

        assert excinfo.value.code is IXErrorCode.IX_000_006
class TestImages:
    """Raster image inputs handed to ``DocumentIngestor.build_pages``."""

    def test_single_frame_png(self, tmp_path: Path) -> None:
        """A 50x80 PNG yields exactly one page with those dimensions."""
        from PIL import Image

        png_path = tmp_path / "img.png"
        blank = Image.new("RGB", (50, 80), color="white")
        blank.save(png_path, format="PNG")

        ingestor = DocumentIngestor()
        pages, metas = ingestor.build_pages(
            files=[(png_path, "image/png")],
            texts=[],
        )

        assert len(pages) == 1
        only_page = pages[0]
        assert only_page.width == 50
        assert only_page.height == 80
        assert metas[0].file_index == 0

    def test_multi_frame_tiff_yields_multiple_pages(self, tmp_path: Path) -> None:
        """Each frame of a 2-frame TIFF becomes its own 10x10 page."""
        tiff_path = tmp_path / "multi.tif"
        tiff_path.write_bytes(_make_multi_frame_tiff_bytes(2))

        ingestor = DocumentIngestor()
        pages, metas = ingestor.build_pages(
            files=[(tiff_path, "image/tiff")],
            texts=[],
        )

        assert len(pages) == 2
        for frame_page in pages:
            assert frame_page.width == 10
            assert frame_page.height == 10

        # Every frame comes from the same source file, so only index 0 appears.
        seen_indexes = {meta.file_index for meta in metas}
        assert seen_indexes == {0}
class TestTexts:
    """Plain-text inputs handed to ``DocumentIngestor.build_pages``."""

    def test_texts_become_pages(self) -> None:
        """Two text snippets yield pages 1 and 2 with no file_index."""
        ingestor = DocumentIngestor()
        pages, metas = ingestor.build_pages(
            files=[],
            texts=["hello", "world"],
        )

        assert len(pages) == 2
        first_page, second_page = pages[0], pages[1]
        assert first_page.page_no == 1
        assert second_page.page_no == 2

        # Text-backed pages do not originate from a file.
        first_meta, second_meta = metas[0], metas[1]
        assert first_meta.file_index is None
        assert second_meta.file_index is None
class TestFileIndexes:
    """``file_index`` assignment when several files are ingested together."""

    def test_multi_file_indexes_are_contiguous(self, tmp_path: Path) -> None:
        """Pages of a 2-page PDF then a 1-page PDF get indexes 0, 0, 1."""
        first_pdf = tmp_path / "a.pdf"
        second_pdf = tmp_path / "b.pdf"
        first_pdf.write_bytes(_make_pdf_bytes(2))
        second_pdf.write_bytes(_make_pdf_bytes(1))

        inputs = [
            (first_pdf, "application/pdf"),
            (second_pdf, "application/pdf"),
        ]
        ingestor = DocumentIngestor()
        pages, metas = ingestor.build_pages(files=inputs, texts=[])

        assert len(pages) == 3
        # The first two pages belong to file 0, the last one to file 1.
        for position, expected_index in enumerate((0, 0, 1)):
            assert metas[position].file_index == expected_index