feat(ingestion): fetch_file + MIME sniff + DocumentIngestor (spec §6.1)

Three layered modules the SetupStep will wire together in Task 2.4.

- fetch.py: async httpx fetch with configurable timeouts + incremental size cap (stream=True, accumulate bytes, raise IX_000_007 when exceeded). file:// URLs read locally. Auth headers pass through. The caller injects a FetchConfig — env reads happen in ix.config (Chunk 3).
- mime.py: python-magic byte-sniff + SUPPORTED_MIMES frozenset + require_supported(mime) helper that raises IX_000_005.
- pages.py: DocumentIngestor.build_pages(files, texts) -> (list[Page], list[PageMetadata]). PDFs via PyMuPDF (hard 100 pg/PDF cap -> IX_000_006), images via Pillow (multi-frame TIFFs yield multiple Pages), texts as zero-dim Pages so GenAIStep can still cite them.

21 new unit tests (141 total) cover: fetch success with headers, 4xx/5xx mapping, timeout -> IX_000_007, size cap enforced globally + per-file, file:// happy path + missing file, MIME detection for PDF/PNG/JPEG/TIFF, require_supported gate, PDF/TIFF/text page counts, 101-page PDF -> IX_000_006, multi-file file_index assignment.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-18 11:12:00 +02:00 · 2026-04-18 11:12:00 +02:00 · 290e51416f
commit 290e51416f
parent 2709fb8d6b
7 changed files with 676 additions and 0 deletions
--- a/src/ix/ingestion/__init__.py
+++ b/src/ix/ingestion/__init__.py
@@ -0,0 +1,27 @@
+"""Ingestion pipeline helpers: fetch → MIME-detect → build pages.
+
+Three modules layered bottom-up:
+
+* :mod:`ix.ingestion.fetch` — async HTTP(S) / ``file://`` downloader with
+  incremental size caps and pluggable timeouts.
+* :mod:`ix.ingestion.mime` — byte-sniffing MIME detection + the
+  MVP-supported MIME set.
+* :mod:`ix.ingestion.pages` — :class:`DocumentIngestor` that turns local
+  files + raw texts into the flat :class:`~ix.contracts.Page` list the
+  OCR step expects.
+"""
+
+from __future__ import annotations
+
+from ix.ingestion.fetch import FetchConfig, fetch_file
+from ix.ingestion.mime import SUPPORTED_MIMES, detect_mime, require_supported
+from ix.ingestion.pages import DocumentIngestor
+
+__all__ = [
+    "SUPPORTED_MIMES",
+    "DocumentIngestor",
+    "FetchConfig",
+    "detect_mime",
+    "fetch_file",
+    "require_supported",
+]
--- a/src/ix/ingestion/fetch.py
+++ b/src/ix/ingestion/fetch.py
@@ -0,0 +1,144 @@
+"""Async file fetcher (spec §6.1).
+
+Supports ``http(s)://`` URLs (via httpx with configurable connect/read
+timeouts and an incremental size cap) and ``file://`` URLs (read from
+local fs — used by the E2E fixture). Auth headers on the :class:`FileRef`
+pass through unchanged.
+
+Every failure mode surfaces as :attr:`~ix.errors.IXErrorCode.IX_000_007`
+with the offending URL + cause in the ``detail`` slot so the caller log
+line is grep-friendly.
+
+Env-driven defaults live in :mod:`ix.config` (Chunk 3). The caller injects
+a :class:`FetchConfig` — this module is purely mechanical.
+"""
+
+from __future__ import annotations
+
+import urllib.parse
+from dataclasses import dataclass
+from pathlib import Path
+
+import httpx
+
+from ix.contracts import FileRef
+from ix.errors import IXErrorCode, IXException
+
+
+@dataclass(slots=True)
+class FetchConfig:
+    """Per-fetch knobs injected by the caller.
+
+    ``connect_timeout_s`` / ``read_timeout_s`` → httpx timeouts.
+    ``max_bytes`` is the pipeline-wide default cap; the per-file override
+    on :attr:`~ix.contracts.FileRef.max_bytes` wins when lower.
+    """
+
+    # Seconds allowed for establishing the connection; passed to
+    # ``httpx.Timeout`` as its ``connect`` override.
+    connect_timeout_s: float
+    # Seconds passed to ``httpx.Timeout`` as its default value, so besides
+    # reads it also applies to every phase that is not explicitly overridden.
+    read_timeout_s: float
+    # Upper bound, in bytes, on a single fetched file.
+    max_bytes: int
+
+
+def _effective_cap(file_ref: FileRef, cfg: FetchConfig) -> int:
+    """Resolve the byte limit that applies to this particular fetch.
+
+    A per-file ``max_bytes`` can only tighten the pipeline-wide limit taken
+    from ``cfg``, never loosen it; without an override the pipeline limit
+    is used as-is.
+    """
+    override = file_ref.max_bytes
+    return cfg.max_bytes if override is None else min(cfg.max_bytes, override)
+
+
+def _safe_filename(url: str) -> str:
+    """Derive a readable, single-component filename for the scratch copy.
+
+    The name is the last segment of the URL path; the query string and
+    fragment are ignored. Path separators are replaced with ``_``. When the
+    result would be empty or a dot segment (``.`` / ``..``), ``"download"``
+    is returned instead, because joining ``..`` onto the scratch directory
+    would resolve to its parent.
+    """
+    parsed = urllib.parse.urlparse(url)
+    candidate = Path(parsed.path).name
+    # Strip anything that would escape the tmp dir.
+    candidate = candidate.replace("/", "_").replace("\\", "_")
+    if candidate in ("", ".", ".."):
+        return "download"
+    return candidate
+
+
+async def _fetch_http(file_ref: FileRef, dst: Path, cfg: FetchConfig) -> None:
+    """Stream an HTTP(S) resource into ``dst``, enforcing the effective size cap.
+
+    The body is written chunk by chunk, so an oversized response is
+    abandoned as soon as the running total passes the cap instead of being
+    downloaded in full. If the download fails after ``dst`` has been opened,
+    the partially written file is removed so callers never see a truncated
+    scratch copy.
+
+    Args:
+        file_ref: Source reference; ``url`` and ``headers`` are sent as-is.
+        dst: Local path the response body is written to.
+        cfg: Timeouts and the pipeline-wide size cap.
+
+    Raises:
+        IXException: ``IX_000_007`` when the status code is >= 300, the size
+            cap is exceeded, the request times out, or httpx reports any
+            other ``HTTPError``.
+    """
+    cap = _effective_cap(file_ref, cfg)
+    timeout = httpx.Timeout(
+        cfg.read_timeout_s,
+        connect=cfg.connect_timeout_s,
+    )
+    created = False  # True once dst has been opened for writing.
+    completed = False  # True once the whole body has been written.
+    try:
+        async with (
+            httpx.AsyncClient(timeout=timeout) as client,
+            client.stream(
+                "GET",
+                file_ref.url,
+                headers=file_ref.headers or None,
+            ) as response,
+        ):
+            # 3xx is rejected as well: redirects are not followed under
+            # httpx's default client settings, so the body would not be the
+            # requested document.
+            if response.status_code >= 300:
+                raise IXException(
+                    IXErrorCode.IX_000_007,
+                    detail=f"{file_ref.url}: HTTP {response.status_code}",
+                )
+            total = 0
+            with dst.open("wb") as fh:
+                created = True
+                async for chunk in response.aiter_bytes():
+                    total += len(chunk)
+                    if total > cap:
+                        raise IXException(
+                            IXErrorCode.IX_000_007,
+                            detail=f"{file_ref.url}: size cap {cap} bytes exceeded",
+                        )
+                    fh.write(chunk)
+        completed = True
+    except IXException:
+        raise
+    except httpx.TimeoutException as exc:
+        raise IXException(
+            IXErrorCode.IX_000_007,
+            detail=f"{file_ref.url}: timeout ({exc.__class__.__name__})",
+        ) from exc
+    except httpx.HTTPError as exc:
+        raise IXException(
+            IXErrorCode.IX_000_007,
+            detail=f"{file_ref.url}: {exc.__class__.__name__}: {exc}",
+        ) from exc
+    finally:
+        if created and not completed:
+            # Best-effort cleanup; a failure here must not mask the error
+            # that aborted the download.
+            try:
+                dst.unlink(missing_ok=True)
+            except OSError:
+                pass
+
+
+def _fetch_file_scheme(file_ref: FileRef, dst: Path, cfg: FetchConfig) -> None:
+    """Copy the local file named by a ``file://`` URL into ``dst``.
+
+    The URL path is percent-decoded before use, because ``Path.as_uri()``
+    encodes spaces and non-ASCII characters (``my file.pdf`` becomes
+    ``my%20file.pdf``). The size is checked against the effective cap before
+    any bytes are read.
+
+    Args:
+        file_ref: Source reference carrying the ``file://`` URL.
+        dst: Local path the copy is written to.
+        cfg: Supplies the pipeline-wide size cap.
+
+    Raises:
+        IXException: ``IX_000_007`` when the file does not exist, exceeds the
+            size cap, or cannot be stat'ed, read or written.
+    """
+    cap = _effective_cap(file_ref, cfg)
+    raw_path = urllib.parse.urlparse(file_ref.url).path
+    src_path = Path(urllib.parse.unquote(raw_path))
+    if not src_path.exists():
+        raise IXException(
+            IXErrorCode.IX_000_007,
+            detail=f"{file_ref.url}: file does not exist",
+        )
+    try:
+        size = src_path.stat().st_size
+    except OSError as exc:
+        raise IXException(
+            IXErrorCode.IX_000_007,
+            detail=f"{file_ref.url}: {exc.__class__.__name__}: {exc}",
+        ) from exc
+    if size > cap:
+        raise IXException(
+            IXErrorCode.IX_000_007,
+            detail=f"{file_ref.url}: size cap {cap} bytes exceeded",
+        )
+    try:
+        dst.write_bytes(src_path.read_bytes())
+    except OSError as exc:
+        # Covers directories, permission problems and files that vanish
+        # between the existence check and the read.
+        raise IXException(
+            IXErrorCode.IX_000_007,
+            detail=f"{file_ref.url}: {exc.__class__.__name__}: {exc}",
+        ) from exc
+
+
+async def fetch_file(file_ref: FileRef, tmp_dir: Path, cfg: FetchConfig) -> Path:
+    """Download / copy ``file_ref`` into ``tmp_dir`` and return the local path.
+
+    http(s) and file:// URLs both supported. ``tmp_dir`` is created if it
+    does not exist. The local filename is derived from the URL path; when a
+    file of that name is already present in ``tmp_dir`` a numeric suffix is
+    added, so two URLs sharing a basename (for example
+    ``/doc/1/download`` and ``/doc/2/download``) do not overwrite each
+    other.
+
+    Args:
+        file_ref: Source reference (URL, optional headers and size override).
+        tmp_dir: Scratch directory receiving the local copy.
+        cfg: Timeouts and the pipeline-wide size cap.
+
+    Returns:
+        Path of the local copy inside ``tmp_dir``.
+
+    Raises:
+        IXException: :attr:`~ix.errors.IXErrorCode.IX_000_007` on any fetch
+            failure, including an unsupported URL scheme.
+    """
+    tmp_dir.mkdir(parents=True, exist_ok=True)
+    scheme = urllib.parse.urlparse(file_ref.url).scheme.lower()
+    dst = tmp_dir / _safe_filename(file_ref.url)
+
+    # Pick a free name rather than clobbering an earlier download.
+    # NOTE: this check is not atomic; concurrent fetches into the same
+    # directory can still race for the same name.
+    stem, suffix = dst.stem, dst.suffix
+    attempt = 1
+    while dst.exists():
+        dst = tmp_dir / f"{stem}_{attempt}{suffix}"
+        attempt += 1
+
+    if scheme in ("http", "https"):
+        await _fetch_http(file_ref, dst, cfg)
+    elif scheme == "file":
+        _fetch_file_scheme(file_ref, dst, cfg)
+    else:
+        raise IXException(
+            IXErrorCode.IX_000_007,
+            detail=f"{file_ref.url}: unsupported URL scheme {scheme!r}",
+        )
+
+    return dst
+
+
+__all__ = ["FetchConfig", "fetch_file"]
--- a/src/ix/ingestion/mime.py
+++ b/src/ix/ingestion/mime.py
@@ -0,0 +1,37 @@
+"""MIME detection + supported-MIME gate (spec §6.1).
+
+Bytes-only; URL extensions are ignored because callers (Paperless, …)
+may serve `/download` routes without a file suffix. ``python-magic``
+reads the file header and returns the canonical MIME.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import magic
+
+from ix.errors import IXErrorCode, IXException
+
+# MIME types accepted by :func:`require_supported`; anything else is
+# rejected with ``IX_000_005``.
+SUPPORTED_MIMES: frozenset[str] = frozenset(
+    {
+        "application/pdf",
+        "image/png",
+        "image/jpeg",
+        "image/tiff",
+    }
+)
+
+
+def detect_mime(path: Path) -> str:
+    """Sniff the file at ``path`` and return its MIME type string.
+
+    Detection is delegated to ``magic.from_file`` with ``mime=True``.
+    """
+    location = str(path)
+    return magic.from_file(location, mime=True)
+
+
+def require_supported(mime: str) -> None:
+    """Gate ``mime`` against :data:`SUPPORTED_MIMES`.
+
+    Returns silently for a supported type; otherwise raises
+    :class:`~ix.errors.IXException` with ``IX_000_005`` and the rejected
+    MIME string as ``detail``.
+    """
+    if mime in SUPPORTED_MIMES:
+        return
+    raise IXException(IXErrorCode.IX_000_005, detail=mime)
+
+
+__all__ = ["SUPPORTED_MIMES", "detect_mime", "require_supported"]
--- a/src/ix/ingestion/pages.py
+++ b/src/ix/ingestion/pages.py
@@ -0,0 +1,118 @@
+"""Turn downloaded files + raw texts into a flat :class:`Page` list (spec §6.1).
+
+PDFs → one :class:`Page` per page via PyMuPDF, with a hard 100 pages/PDF
+cap (``IX_000_006``). Images → Pillow; multi-frame TIFFs yield one Page
+per frame. Texts → one zero-dimension Page each so the downstream OCR /
+GenAI steps can still cite them.
+
+A parallel list of :class:`~ix.segmentation.PageMetadata` is returned so
+the pipeline (via :class:`SegmentIndex`) can resolve segment IDs back to
+``file_index`` anchors.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import fitz  # PyMuPDF
+from PIL import Image, ImageSequence
+
+from ix.contracts import Page
+from ix.errors import IXErrorCode, IXException
+from ix.segmentation import PageMetadata
+
+# Maximum number of pages accepted per PDF; a longer document is rejected
+# with ``IX_000_006``.
+_PDF_PAGE_CAP = 100
+
+
+class DocumentIngestor:
+    """Builds the flat Page list that feeds :class:`~ix.pipeline.ocr_step.OCRStep`.
+
+    No constructor args for MVP — the 100-page cap is a spec constant. If
+    this needs to be tunable later, move it to a dataclass config.
+    """
+
+    def build_pages(
+        self,
+        files: list[tuple[Path, str]],
+        texts: list[str],
+    ) -> tuple[list[Page], list[PageMetadata]]:
+        """Return ``(pages, metas)`` in insertion order.
+
+        ``files`` is a list of ``(local_path, mime_type)`` tuples; mimes
+        have already been validated by :func:`ix.ingestion.mime.require_supported`.
+        Pages from ``files`` come first, followed by one page per entry of
+        ``texts``. ``page_no`` is 1-based and runs continuously across all
+        inputs; ``metas[i]`` describes ``pages[i]``.
+
+        Raises:
+            IXException: ``IX_000_005`` for a MIME type this method does not
+                handle, ``IX_000_006`` when a PDF exceeds the page cap.
+        """
+        pages: list[Page] = []
+        metas: list[PageMetadata] = []
+
+        for file_index, (path, mime) in enumerate(files):
+            if mime == "application/pdf":
+                self._extend_with_pdf(path, file_index, pages, metas)
+            elif mime in ("image/png", "image/jpeg", "image/tiff"):
+                self._extend_with_image(path, file_index, pages, metas)
+            else:  # pragma: no cover - defensive; require_supported should gate upstream
+                raise IXException(IXErrorCode.IX_000_005, detail=mime)
+
+        for _ in texts:
+            # Text-backed pages are zero-dim; they exist so the GenAIStep
+            # can merge their content into the prompt alongside OCR. Only
+            # the number of texts matters here; the content is not stored
+            # on the Page.
+            pages.append(
+                Page(
+                    page_no=len(pages) + 1,
+                    width=0.0,
+                    height=0.0,
+                    lines=[],
+                )
+            )
+            metas.append(PageMetadata(file_index=None))
+
+        return pages, metas
+
+    def _extend_with_pdf(
+        self,
+        path: Path,
+        file_index: int,
+        pages: list[Page],
+        metas: list[PageMetadata],
+    ) -> None:
+        """Append one Page (and matching metadata) per page of the PDF at ``path``.
+
+        Raises ``IX_000_006`` before appending anything when the document
+        has more than ``_PDF_PAGE_CAP`` pages.
+        """
+        # The MIME type was established by byte-sniffing, while the local
+        # filename comes from the URL and may carry a missing or misleading
+        # extension. Name the format explicitly so the parser is not chosen
+        # from the filename.
+        doc = fitz.open(str(path), filetype="pdf")
+        try:
+            if doc.page_count > _PDF_PAGE_CAP:
+                raise IXException(
+                    IXErrorCode.IX_000_006,
+                    detail=f"{path}: {doc.page_count} pages (cap {_PDF_PAGE_CAP})",
+                )
+            for page in doc:
+                rect = page.rect
+                pages.append(
+                    Page(
+                        page_no=len(pages) + 1,
+                        width=float(rect.width),
+                        height=float(rect.height),
+                        lines=[],
+                    )
+                )
+                metas.append(PageMetadata(file_index=file_index))
+        finally:
+            doc.close()
+
+    def _extend_with_image(
+        self,
+        path: Path,
+        file_index: int,
+        pages: list[Page],
+        metas: list[PageMetadata],
+    ) -> None:
+        """Append one Page (and matching metadata) per frame of the image at ``path``.
+
+        Single-frame formats yield one Page; a multi-frame TIFF yields one
+        Page per frame, all sharing ``file_index``.
+        """
+        with Image.open(path) as img:
+            for frame in ImageSequence.Iterator(img):
+                pages.append(
+                    Page(
+                        page_no=len(pages) + 1,
+                        width=float(frame.width),
+                        height=float(frame.height),
+                        lines=[],
+                    )
+                )
+                metas.append(PageMetadata(file_index=file_index))
+
+__all__ = ["DocumentIngestor"]
--- a/tests/unit/test_ingestion_fetch.py
+++ b/tests/unit/test_ingestion_fetch.py
@@ -0,0 +1,138 @@
+"""Tests for :func:`ix.ingestion.fetch.fetch_file` (spec §6.1)."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import httpx
+import pytest
+from pytest_httpx import HTTPXMock
+
+from ix.contracts import FileRef
+from ix.errors import IXErrorCode, IXException
+from ix.ingestion import FetchConfig, fetch_file
+
+
+@pytest.fixture
+def cfg() -> FetchConfig:
+    """Default fetch settings for these tests: short timeouts, 1 MiB cap."""
+    one_mib = 1024 * 1024
+    return FetchConfig(connect_timeout_s=1.0, read_timeout_s=2.0, max_bytes=one_mib)
+
+
+class TestSuccessPath:
+    """Happy-path HTTP download."""
+
+    async def test_downloads_with_auth_header_and_writes_to_tmp(
+        self, tmp_path: Path, cfg: FetchConfig, httpx_mock: HTTPXMock
+    ) -> None:
+        """A 200 response lands on disk and the caller's auth header is sent."""
+        body = b"%PDF-1.4 body"
+        url = "https://paperless.local/doc/123/download"
+        httpx_mock.add_response(
+            url=url,
+            method="GET",
+            status_code=200,
+            content=body,
+            headers={"content-type": "application/pdf"},
+        )
+
+        ref = FileRef(url=url, headers={"Authorization": "Token abc"})
+        local = await fetch_file(ref, tmp_dir=tmp_path, cfg=cfg)
+
+        assert local.exists()
+        assert local.read_bytes() == body
+
+        # Exactly one request went out, and it carried the auth header.
+        sent = httpx_mock.get_requests()
+        assert len(sent) == 1
+        assert sent[0].headers["Authorization"] == "Token abc"
+
+
+class TestNon2xx:
+    """Error status codes are mapped to IX_000_007."""
+
+    async def test_404_raises_IX_000_007(
+        self, tmp_path: Path, cfg: FetchConfig, httpx_mock: HTTPXMock
+    ) -> None:
+        """A 404 fails the fetch and the status code shows up in the detail."""
+        url = "https://host.local/missing.pdf"
+        httpx_mock.add_response(url=url, status_code=404, content=b"")
+
+        with pytest.raises(IXException) as excinfo:
+            await fetch_file(FileRef(url=url), tmp_dir=tmp_path, cfg=cfg)
+
+        error = excinfo.value
+        assert error.code is IXErrorCode.IX_000_007
+        assert "404" in (error.detail or "")
+
+    async def test_500_raises_IX_000_007(
+        self, tmp_path: Path, cfg: FetchConfig, httpx_mock: HTTPXMock
+    ) -> None:
+        """A 500 fails the fetch with the same error code."""
+        url = "https://host.local/boom.pdf"
+        httpx_mock.add_response(url=url, status_code=500, content=b"oops")
+
+        with pytest.raises(IXException) as excinfo:
+            await fetch_file(FileRef(url=url), tmp_dir=tmp_path, cfg=cfg)
+
+        assert excinfo.value.code is IXErrorCode.IX_000_007
+
+
+class TestTimeout:
+    """Transport timeouts are mapped to IX_000_007."""
+
+    async def test_timeout_raises_IX_000_007(
+        self, tmp_path: Path, cfg: FetchConfig, httpx_mock: HTTPXMock
+    ) -> None:
+        """A read timeout raised by the transport fails the fetch."""
+        url = "https://host.local/slow.pdf"
+        httpx_mock.add_exception(httpx.ReadTimeout("slow"), url=url)
+
+        with pytest.raises(IXException) as excinfo:
+            await fetch_file(FileRef(url=url), tmp_dir=tmp_path, cfg=cfg)
+
+        assert excinfo.value.code is IXErrorCode.IX_000_007
+
+
+class TestOversize:
+    """Responses larger than the effective cap are rejected."""
+
+    async def test_oversize_raises_IX_000_007(
+        self, tmp_path: Path, httpx_mock: HTTPXMock
+    ) -> None:
+        """The pipeline-wide cap alone is enough to reject a large body."""
+        url = "https://host.local/big.pdf"
+        tight_cfg = FetchConfig(
+            connect_timeout_s=1.0,
+            read_timeout_s=2.0,
+            max_bytes=100,
+        )
+        # 500 bytes of payload against a 100-byte cap.
+        httpx_mock.add_response(url=url, status_code=200, content=b"x" * 500)
+
+        with pytest.raises(IXException) as excinfo:
+            await fetch_file(FileRef(url=url), tmp_dir=tmp_path, cfg=tight_cfg)
+
+        assert excinfo.value.code is IXErrorCode.IX_000_007
+
+    async def test_per_file_max_bytes_override(
+        self, tmp_path: Path, httpx_mock: HTTPXMock
+    ) -> None:
+        """A tighter per-file cap wins over a generous pipeline-wide cap."""
+        url = "https://host.local/mid.pdf"
+        roomy_cfg = FetchConfig(
+            connect_timeout_s=1.0,
+            read_timeout_s=2.0,
+            max_bytes=1_000_000,
+        )
+        httpx_mock.add_response(url=url, status_code=200, content=b"x" * 500)
+        # The file reference itself lowers the limit to 100 bytes.
+        capped_ref = FileRef(url=url, max_bytes=100)
+
+        with pytest.raises(IXException) as excinfo:
+            await fetch_file(capped_ref, tmp_dir=tmp_path, cfg=roomy_cfg)
+
+        assert excinfo.value.code is IXErrorCode.IX_000_007
+
+
+class TestFileUrl:
+    """``file://`` URLs are served from the local filesystem."""
+
+    async def test_file_scheme_reads_local(
+        self, tmp_path: Path, cfg: FetchConfig
+    ) -> None:
+        """An existing local file is copied byte-for-byte into the tmp dir."""
+        payload = b"%PDF-1.4\nfile scheme content"
+        source = tmp_path / "in.pdf"
+        source.write_bytes(payload)
+
+        ref = FileRef(url=source.as_uri())
+        copied = await fetch_file(ref, tmp_dir=tmp_path / "out", cfg=cfg)
+
+        assert copied.exists()
+        assert copied.read_bytes() == payload
+
+    async def test_file_scheme_missing_raises(
+        self, tmp_path: Path, cfg: FetchConfig
+    ) -> None:
+        """A URL pointing at a non-existent file fails with IX_000_007."""
+        absent = tmp_path / "nope.pdf"
+        ref = FileRef(url=absent.as_uri())
+
+        with pytest.raises(IXException) as excinfo:
+            await fetch_file(ref, tmp_dir=tmp_path, cfg=cfg)
+
+        assert excinfo.value.code is IXErrorCode.IX_000_007
--- a/tests/unit/test_ingestion_mime.py
+++ b/tests/unit/test_ingestion_mime.py
@@ -0,0 +1,96 @@
+"""Tests for MIME sniffing (spec §6.1)."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import pytest
+
+from ix.errors import IXErrorCode, IXException
+from ix.ingestion import SUPPORTED_MIMES, detect_mime, require_supported
+
+# Real-header fixtures. python-magic looks at bytes, not extensions, so
+# these are the smallest valid-byte samples we can produce on the fly.
+
+_PDF_BYTES = b"%PDF-1.4\n%\xe2\xe3\xcf\xd3\n1 0 obj\n<<>>\nendobj\nxref\n0 1\n0000000000 65535 f\ntrailer<<>>\nstartxref\n0\n%%EOF\n"
+
+_PNG_BYTES = bytes.fromhex(
+    # PNG magic + minimal IHDR + IDAT + IEND (1x1 all-black).
+    "89504e470d0a1a0a"
+    "0000000d49484452"
+    "00000001000000010806000000"
+    "1f15c4890000000d"
+    "49444154789c6300010000000500010d0a2db400000000"
+    "49454e44ae426082"
+)
+
+_JPEG_BYTES = (
+    b"\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00\x00\x01\x00\x01\x00\x00"
+    b"\xff\xdb\x00C\x00" + b"\x08" * 64
+    + b"\xff\xc0\x00\x0b\x08\x00\x01\x00\x01\x01\x01\x11\x00"
+    + b"\xff\xc4\x00\x1f\x00\x00\x01\x05\x01\x01\x01\x01\x01\x01\x00\x00"
+    + b"\x00\x00\x00\x00\x00\x00\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b"
+    + b"\xff\xda\x00\x08\x01\x01\x00\x00?\x00\xfb\xff\xd9"
+)
+
+
+def _make_tiff_bytes() -> bytes:
+    """Encode a 2x2 black greyscale image as TIFF using Pillow."""
+    from io import BytesIO
+
+    from PIL import Image
+
+    image = Image.new("L", (2, 2), color=0)
+    sink = BytesIO()
+    image.save(sink, format="TIFF")
+    return sink.getvalue()
+
+
+_TIFF_BYTES = _make_tiff_bytes()
+
+
+@pytest.fixture
+def fixtures_dir(tmp_path: Path) -> Path:
+    """Create a directory holding one sample file per format under test."""
+    root = tmp_path / "fixtures"
+    root.mkdir()
+    samples = {
+        "sample.pdf": _PDF_BYTES,
+        "sample.png": _PNG_BYTES,
+        "sample.jpg": _JPEG_BYTES,
+        "sample.tif": _TIFF_BYTES,
+        "sample.txt": b"this is plain text, no magic bytes\n",
+    }
+    for filename, payload in samples.items():
+        (root / filename).write_bytes(payload)
+    return root
+
+
+class TestDetectMime:
+    """``detect_mime`` recognises each supported format from its bytes."""
+
+    def test_pdf(self, fixtures_dir: Path) -> None:
+        sample = fixtures_dir / "sample.pdf"
+        assert detect_mime(sample) == "application/pdf"
+
+    def test_png(self, fixtures_dir: Path) -> None:
+        sample = fixtures_dir / "sample.png"
+        assert detect_mime(sample) == "image/png"
+
+    def test_jpeg(self, fixtures_dir: Path) -> None:
+        sample = fixtures_dir / "sample.jpg"
+        assert detect_mime(sample) == "image/jpeg"
+
+    def test_tiff(self, fixtures_dir: Path) -> None:
+        sample = fixtures_dir / "sample.tif"
+        assert detect_mime(sample) == "image/tiff"
+
+
+class TestSupportedSet:
+    """Pins the exact contents of ``SUPPORTED_MIMES``."""
+
+    def test_supported_mimes_contents(self) -> None:
+        expected = {
+            "application/pdf",
+            "image/png",
+            "image/jpeg",
+            "image/tiff",
+        }
+        assert expected == set(SUPPORTED_MIMES)
+
+
+class TestRequireSupported:
+    """``require_supported`` passes supported types and rejects the rest."""
+
+    def test_allows_supported(self) -> None:
+        for supported in SUPPORTED_MIMES:
+            # Must return without raising.
+            require_supported(supported)
+
+    def test_rejects_unsupported(self) -> None:
+        with pytest.raises(IXException) as excinfo:
+            require_supported("text/plain")
+
+        error = excinfo.value
+        assert error.code is IXErrorCode.IX_000_005
+        assert "text/plain" in (error.detail or "")
--- a/tests/unit/test_ingestion_pages.py
+++ b/tests/unit/test_ingestion_pages.py
@@ -0,0 +1,116 @@
+"""Tests for DocumentIngestor.build_pages (spec §6.1)."""
+
+from __future__ import annotations
+
+from io import BytesIO
+from pathlib import Path
+
+import pytest
+
+from ix.errors import IXErrorCode, IXException
+from ix.ingestion import DocumentIngestor
+
+
+def _make_pdf_bytes(n_pages: int) -> bytes:
+    """Render an in-memory PDF with ``n_pages`` pages of 200x300 points.
+
+    Each page carries a short ``page N`` label.
+    """
+    import fitz
+
+    pdf = fitz.open()
+    for number in range(1, n_pages + 1):
+        sheet = pdf.new_page(width=200, height=300)
+        sheet.insert_text((10, 20), f"page {number}")
+    encoded = pdf.tobytes()
+    pdf.close()
+    return encoded
+
+
+def _make_multi_frame_tiff_bytes(n_frames: int) -> bytes:
+    """Encode ``n_frames`` 10x10 greyscale images as one multi-frame TIFF.
+
+    Frame ``i`` is filled with grey level ``i * 30`` so frames differ.
+    """
+    from PIL import Image
+
+    images = []
+    for index in range(n_frames):
+        images.append(Image.new("L", (10, 10), color=index * 30))
+
+    sink = BytesIO()
+    images[0].save(sink, format="TIFF", save_all=True, append_images=images[1:])
+    return sink.getvalue()
+
+
+class TestPdf:
+    """PDF inputs: one Page per PDF page, bounded by the page cap."""
+
+    def test_three_page_pdf_yields_three_pages(self, tmp_path: Path) -> None:
+        pdf_path = tmp_path / "doc.pdf"
+        pdf_path.write_bytes(_make_pdf_bytes(3))
+
+        ingestor = DocumentIngestor()
+        pages, metas = ingestor.build_pages(
+            files=[(pdf_path, "application/pdf")], texts=[]
+        )
+
+        assert len(pages) == 3
+        for expected_no, page in enumerate(pages, start=1):
+            assert page.page_no == expected_no
+            assert page.width > 0
+            assert page.height > 0
+        assert len(metas) == 3
+        for meta in metas:
+            assert meta.file_index == 0
+
+    def test_page_count_cap_raises_IX_000_006(self, tmp_path: Path) -> None:
+        # One page over the 100-page limit.
+        pdf_path = tmp_path / "toomany.pdf"
+        pdf_path.write_bytes(_make_pdf_bytes(101))
+
+        ingestor = DocumentIngestor()
+        with pytest.raises(IXException) as excinfo:
+            ingestor.build_pages(files=[(pdf_path, "application/pdf")], texts=[])
+
+        assert excinfo.value.code is IXErrorCode.IX_000_006
+
+
+class TestImages:
+    """Image inputs: one Page per frame, sized in pixels."""
+
+    def test_single_frame_png(self, tmp_path: Path) -> None:
+        from PIL import Image
+
+        png_path = tmp_path / "img.png"
+        Image.new("RGB", (50, 80), color="white").save(png_path, format="PNG")
+
+        ingestor = DocumentIngestor()
+        pages, metas = ingestor.build_pages(files=[(png_path, "image/png")], texts=[])
+
+        assert len(pages) == 1
+        only_page = pages[0]
+        assert only_page.width == 50
+        assert only_page.height == 80
+        assert metas[0].file_index == 0
+
+    def test_multi_frame_tiff_yields_multiple_pages(self, tmp_path: Path) -> None:
+        tiff_path = tmp_path / "multi.tif"
+        tiff_path.write_bytes(_make_multi_frame_tiff_bytes(2))
+
+        ingestor = DocumentIngestor()
+        pages, metas = ingestor.build_pages(files=[(tiff_path, "image/tiff")], texts=[])
+
+        assert len(pages) == 2
+        for frame_page in pages:
+            assert frame_page.width == 10
+            assert frame_page.height == 10
+        # Every frame points back at the single source file.
+        assert {meta.file_index for meta in metas} == {0}
+
+
+class TestTexts:
+    """Raw texts become pages without a backing file."""
+
+    def test_texts_become_pages(self) -> None:
+        ingestor = DocumentIngestor()
+        pages, metas = ingestor.build_pages(files=[], texts=["hello", "world"])
+
+        assert len(pages) == 2
+        first_page, second_page = pages[0], pages[1]
+        assert first_page.page_no == 1
+        assert second_page.page_no == 2
+        # No file stands behind a text page, so there is no file_index.
+        assert metas[0].file_index is None
+        assert metas[1].file_index is None
+
+
+class TestFileIndexes:
+    """``file_index`` follows the position of each file in the input list."""
+
+    def test_multi_file_indexes_are_contiguous(self, tmp_path: Path) -> None:
+        two_pager = tmp_path / "a.pdf"
+        two_pager.write_bytes(_make_pdf_bytes(2))
+        one_pager = tmp_path / "b.pdf"
+        one_pager.write_bytes(_make_pdf_bytes(1))
+
+        ingestor = DocumentIngestor()
+        inputs = [(two_pager, "application/pdf"), (one_pager, "application/pdf")]
+        pages, metas = ingestor.build_pages(files=inputs, texts=[])
+
+        assert len(pages) == 3
+        # Two pages come from the first file, the third from the second.
+        assert metas[0].file_index == 0
+        assert metas[1].file_index == 0
+        assert metas[2].file_index == 1
+        assert metas[2].file_index == 1