From 290e51416f6965377479805ad9c9bb0d219a2da7 Mon Sep 17 00:00:00 2001 From: Dirk Riemann Date: Sat, 18 Apr 2026 11:12:00 +0200 Subject: [PATCH] =?UTF-8?q?feat(ingestion):=20fetch=5Ffile=20+=20MIME=20sn?= =?UTF-8?q?iff=20+=20DocumentIngestor=20(spec=20=C2=A76.1)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three layered modules the SetupStep will wire together in Task 2.4. - fetch.py: async httpx fetch with configurable timeouts + incremental size cap (stream=True, accumulate bytes, raise IX_000_007 when exceeded). file:// URLs read locally. Auth headers pass through. The caller injects a FetchConfig — env reads happen in ix.config (Chunk 3). - mime.py: python-magic byte-sniff + SUPPORTED_MIMES frozenset + require_supported(mime) helper that raises IX_000_005. - pages.py: DocumentIngestor.build_pages(files, texts) -> (list[Page], list[PageMetadata]). PDFs via PyMuPDF (hard 100 pg/PDF cap -> IX_000_006), images via Pillow (multi-frame TIFFs yield multiple Pages), texts as zero-dim Pages so GenAIStep can still cite them. 21 new unit tests (141 total) cover: fetch success with headers, 4xx/5xx mapping, timeout -> IX_000_007, size cap enforced globally + per-file, file:// happy path + missing file, MIME detection for PDF/PNG/JPEG/TIFF, require_supported gate, PDF/TIFF/text page counts, 101-page PDF -> IX_000_006, multi-file file_index assignment. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- src/ix/ingestion/__init__.py | 27 ++++++ src/ix/ingestion/fetch.py | 144 +++++++++++++++++++++++++++++ src/ix/ingestion/mime.py | 37 ++++++++ src/ix/ingestion/pages.py | 118 +++++++++++++++++++++++ tests/unit/test_ingestion_fetch.py | 138 +++++++++++++++++++++++++++ tests/unit/test_ingestion_mime.py | 96 +++++++++++++++++++ tests/unit/test_ingestion_pages.py | 116 +++++++++++++++++++++++ 7 files changed, 676 insertions(+) create mode 100644 src/ix/ingestion/__init__.py create mode 100644 src/ix/ingestion/fetch.py create mode 100644 src/ix/ingestion/mime.py create mode 100644 src/ix/ingestion/pages.py create mode 100644 tests/unit/test_ingestion_fetch.py create mode 100644 tests/unit/test_ingestion_mime.py create mode 100644 tests/unit/test_ingestion_pages.py diff --git a/src/ix/ingestion/__init__.py b/src/ix/ingestion/__init__.py new file mode 100644 index 0000000..c90250a --- /dev/null +++ b/src/ix/ingestion/__init__.py @@ -0,0 +1,27 @@ +"""Ingestion pipeline helpers: fetch → MIME-detect → build pages. + +Three modules layered bottom-up: + +* :mod:`ix.ingestion.fetch` — async HTTP(S) / ``file://`` downloader with + incremental size caps and pluggable timeouts. +* :mod:`ix.ingestion.mime` — byte-sniffing MIME detection + the + MVP-supported MIME set. +* :mod:`ix.ingestion.pages` — :class:`DocumentIngestor` that turns local + files + raw texts into the flat :class:`~ix.contracts.Page` list the + OCR step expects. 
+""" + +from __future__ import annotations + +from ix.ingestion.fetch import FetchConfig, fetch_file +from ix.ingestion.mime import SUPPORTED_MIMES, detect_mime, require_supported +from ix.ingestion.pages import DocumentIngestor + +__all__ = [ + "SUPPORTED_MIMES", + "DocumentIngestor", + "FetchConfig", + "detect_mime", + "fetch_file", + "require_supported", +] diff --git a/src/ix/ingestion/fetch.py b/src/ix/ingestion/fetch.py new file mode 100644 index 0000000..f807db4 --- /dev/null +++ b/src/ix/ingestion/fetch.py @@ -0,0 +1,144 @@ +"""Async file fetcher (spec §6.1). + +Supports ``http(s)://`` URLs (via httpx with configurable connect/read +timeouts and an incremental size cap) and ``file://`` URLs (read from +local fs — used by the E2E fixture). Auth headers on the :class:`FileRef` +pass through unchanged. + +Every failure mode surfaces as :attr:`~ix.errors.IXErrorCode.IX_000_007` +with the offending URL + cause in the ``detail`` slot so the caller log +line is grep-friendly. + +Env-driven defaults live in :mod:`ix.config` (Chunk 3). The caller injects +a :class:`FetchConfig` — this module is purely mechanical. +""" + +from __future__ import annotations + +import urllib.parse +from dataclasses import dataclass +from pathlib import Path + +import httpx + +from ix.contracts import FileRef +from ix.errors import IXErrorCode, IXException + + +@dataclass(slots=True) +class FetchConfig: + """Per-fetch knobs injected by the caller. + + ``connect_timeout_s`` / ``read_timeout_s`` → httpx timeouts. + ``max_bytes`` is the pipeline-wide default cap; the per-file override + on :attr:`~ix.contracts.FileRef.max_bytes` wins when lower. 
+ """ + + connect_timeout_s: float + read_timeout_s: float + max_bytes: int + + +def _effective_cap(file_ref: FileRef, cfg: FetchConfig) -> int: + """The smaller of the pipeline-wide cap and the per-file override.""" + if file_ref.max_bytes is None: + return cfg.max_bytes + return min(cfg.max_bytes, file_ref.max_bytes) + + +def _safe_filename(url: str) -> str: + """Derive a readable filename for the scratch copy from the URL.""" + parsed = urllib.parse.urlparse(url) + candidate = Path(parsed.path).name or "download" + # Strip anything that would escape the tmp dir. + return candidate.replace("/", "_").replace("\\", "_") + + +async def _fetch_http(file_ref: FileRef, dst: Path, cfg: FetchConfig) -> None: + """HTTP(S) download with incremental size cap. Raises IX_000_007 on any failure.""" + cap = _effective_cap(file_ref, cfg) + timeout = httpx.Timeout( + cfg.read_timeout_s, + connect=cfg.connect_timeout_s, + ) + try: + async with ( + httpx.AsyncClient(timeout=timeout) as client, + client.stream( + "GET", + file_ref.url, + headers=file_ref.headers or None, + ) as response, + ): + if response.status_code >= 300: + raise IXException( + IXErrorCode.IX_000_007, + detail=f"{file_ref.url}: HTTP {response.status_code}", + ) + total = 0 + with dst.open("wb") as fh: + async for chunk in response.aiter_bytes(): + total += len(chunk) + if total > cap: + raise IXException( + IXErrorCode.IX_000_007, + detail=f"{file_ref.url}: size cap {cap} bytes exceeded", + ) + fh.write(chunk) + except IXException: + raise + except httpx.TimeoutException as exc: + raise IXException( + IXErrorCode.IX_000_007, + detail=f"{file_ref.url}: timeout ({exc.__class__.__name__})", + ) from exc + except httpx.HTTPError as exc: + raise IXException( + IXErrorCode.IX_000_007, + detail=f"{file_ref.url}: {exc.__class__.__name__}: {exc}", + ) from exc + + +def _fetch_file_scheme(file_ref: FileRef, dst: Path, cfg: FetchConfig) -> None: + """Local-path read via ``file://`` URL. 
Same failure-mode contract.""" + cap = _effective_cap(file_ref, cfg) + src_path = Path(urllib.parse.unquote(urllib.parse.urlparse(file_ref.url).path)) + if not src_path.is_file(): # regular files only (rejects directories, devices) + raise IXException( + IXErrorCode.IX_000_007, + detail=f"{file_ref.url}: file does not exist", + ) + size = src_path.stat().st_size + if size > cap: + raise IXException( + IXErrorCode.IX_000_007, + detail=f"{file_ref.url}: size cap {cap} bytes exceeded", + ) + dst.write_bytes(src_path.read_bytes()) + + +async def fetch_file(file_ref: FileRef, tmp_dir: Path, cfg: FetchConfig) -> Path: + """Download / copy ``file_ref`` into ``tmp_dir`` and return the local path. + + http(s) and file:// URLs both supported. Any fetch failure raises + :class:`~ix.errors.IXException` with + :attr:`~ix.errors.IXErrorCode.IX_000_007`. + """ + tmp_dir.mkdir(parents=True, exist_ok=True) + scheme = urllib.parse.urlparse(file_ref.url).scheme.lower() + dst = tmp_dir / _safe_filename(file_ref.url) + + if scheme in ("http", "https"): + await _fetch_http(file_ref, dst, cfg) + elif scheme == "file": + _fetch_file_scheme(file_ref, dst, cfg) + else: + raise IXException( + IXErrorCode.IX_000_007, + detail=f"{file_ref.url}: unsupported URL scheme {scheme!r}", + ) + + return dst + + +__all__ = ["FetchConfig", "fetch_file"] diff --git a/src/ix/ingestion/mime.py b/src/ix/ingestion/mime.py new file mode 100644 index 0000000..51aaa8b --- /dev/null +++ b/src/ix/ingestion/mime.py @@ -0,0 +1,37 @@ +"""MIME detection + supported-MIME gate (spec §6.1). + +Bytes-only; URL extensions are ignored because callers (Paperless, …) +may serve `/download` routes without a file suffix. ``python-magic`` +reads the file header and returns the canonical MIME. 
+""" + +from __future__ import annotations + +from pathlib import Path + +import magic + +from ix.errors import IXErrorCode, IXException + +SUPPORTED_MIMES: frozenset[str] = frozenset( + { + "application/pdf", + "image/png", + "image/jpeg", + "image/tiff", + } +) + + +def detect_mime(path: Path) -> str: + """Return the canonical MIME string for ``path`` (byte-sniffed).""" + return magic.from_file(str(path), mime=True) + + +def require_supported(mime: str) -> None: + """Raise :class:`~ix.errors.IXException` (``IX_000_005``) if ``mime`` is unsupported.""" + if mime not in SUPPORTED_MIMES: + raise IXException(IXErrorCode.IX_000_005, detail=mime) + + +__all__ = ["SUPPORTED_MIMES", "detect_mime", "require_supported"] diff --git a/src/ix/ingestion/pages.py b/src/ix/ingestion/pages.py new file mode 100644 index 0000000..3dceb62 --- /dev/null +++ b/src/ix/ingestion/pages.py @@ -0,0 +1,118 @@ +"""Turn downloaded files + raw texts into a flat :class:`Page` list (spec §6.1). + +PDFs → one :class:`Page` per page via PyMuPDF, with a hard 100 pages/PDF +cap (``IX_000_006``). Images → Pillow; multi-frame TIFFs yield one Page +per frame. Texts → one zero-dimension Page each so the downstream OCR / +GenAI steps can still cite them. + +A parallel list of :class:`~ix.segmentation.PageMetadata` is returned so +the pipeline (via :class:`SegmentIndex`) can resolve segment IDs back to +``file_index`` anchors. +""" + +from __future__ import annotations + +from pathlib import Path + +import fitz # PyMuPDF +from PIL import Image, ImageSequence + +from ix.contracts import Page +from ix.errors import IXErrorCode, IXException +from ix.segmentation import PageMetadata + +_PDF_PAGE_CAP = 100 + + +class DocumentIngestor: + """Builds the flat Page list that feeds :class:`~ix.pipeline.ocr_step.OCRStep`. + + No constructor args for MVP — the 100-page cap is a spec constant. If + this needs to be tunable later, move it to a dataclass config. 
+ """ + + def build_pages( + self, + files: list[tuple[Path, str]], + texts: list[str], + ) -> tuple[list[Page], list[PageMetadata]]: + """Return ``(pages, metas)`` in insertion order. + + ``files`` is a list of ``(local_path, mime_type)`` tuples; mimes + have already been validated by :func:`ix.ingestion.mime.require_supported`. + """ + pages: list[Page] = [] + metas: list[PageMetadata] = [] + + for file_index, (path, mime) in enumerate(files): + if mime == "application/pdf": + self._extend_with_pdf(path, file_index, pages, metas) + elif mime in ("image/png", "image/jpeg", "image/tiff"): + self._extend_with_image(path, file_index, pages, metas) + else: # pragma: no cover - defensive; require_supported should gate upstream + raise IXException(IXErrorCode.IX_000_005, detail=mime) + + for _ in texts: + # Text-backed pages are zero-dim; they exist so the GenAIStep + # can merge their content into the prompt alongside OCR. + pages.append( + Page( + page_no=len(pages) + 1, + width=0.0, + height=0.0, + lines=[], + ) + ) + metas.append(PageMetadata(file_index=None)) + + return pages, metas + + def _extend_with_pdf( + self, + path: Path, + file_index: int, + pages: list[Page], + metas: list[PageMetadata], + ) -> None: + doc = fitz.open(str(path)) + try: + if doc.page_count > _PDF_PAGE_CAP: + raise IXException( + IXErrorCode.IX_000_006, + detail=f"{path}: {doc.page_count} pages (cap {_PDF_PAGE_CAP})", + ) + for page in doc: + rect = page.rect + pages.append( + Page( + page_no=len(pages) + 1, + width=float(rect.width), + height=float(rect.height), + lines=[], + ) + ) + metas.append(PageMetadata(file_index=file_index)) + finally: + doc.close() + + def _extend_with_image( + self, + path: Path, + file_index: int, + pages: list[Page], + metas: list[PageMetadata], + ) -> None: + with Image.open(path) as img: + for frame in ImageSequence.Iterator(img): + pages.append( + Page( + page_no=len(pages) + 1, + width=float(frame.width), + height=float(frame.height), + lines=[], + ) + ) + 
metas.append(PageMetadata(file_index=file_index)) + + +__all__ = ["DocumentIngestor"] diff --git a/tests/unit/test_ingestion_fetch.py b/tests/unit/test_ingestion_fetch.py new file mode 100644 index 0000000..bf0a7d3 --- /dev/null +++ b/tests/unit/test_ingestion_fetch.py @@ -0,0 +1,138 @@ +"""Tests for :func:`ix.ingestion.fetch.fetch_file` (spec §6.1).""" + +from __future__ import annotations + +from pathlib import Path + +import httpx +import pytest +from pytest_httpx import HTTPXMock + +from ix.contracts import FileRef +from ix.errors import IXErrorCode, IXException +from ix.ingestion import FetchConfig, fetch_file + + +@pytest.fixture +def cfg() -> FetchConfig: + return FetchConfig( + connect_timeout_s=1.0, + read_timeout_s=2.0, + max_bytes=1024 * 1024, + ) + + +class TestSuccessPath: + async def test_downloads_with_auth_header_and_writes_to_tmp( + self, tmp_path: Path, cfg: FetchConfig, httpx_mock: HTTPXMock + ) -> None: + url = "https://paperless.local/doc/123/download" + httpx_mock.add_response( + url=url, + method="GET", + status_code=200, + content=b"%PDF-1.4 body", + headers={"content-type": "application/pdf"}, + ) + file_ref = FileRef( + url=url, + headers={"Authorization": "Token abc"}, + ) + path = await fetch_file(file_ref, tmp_dir=tmp_path, cfg=cfg) + assert path.exists() + assert path.read_bytes() == b"%PDF-1.4 body" + + # Confirm header went out. 
+ reqs = httpx_mock.get_requests() + assert len(reqs) == 1 + assert reqs[0].headers["Authorization"] == "Token abc" + + +class TestNon2xx: + async def test_404_raises_IX_000_007( + self, tmp_path: Path, cfg: FetchConfig, httpx_mock: HTTPXMock + ) -> None: + url = "https://host.local/missing.pdf" + httpx_mock.add_response(url=url, status_code=404, content=b"") + file_ref = FileRef(url=url) + with pytest.raises(IXException) as ei: + await fetch_file(file_ref, tmp_dir=tmp_path, cfg=cfg) + assert ei.value.code is IXErrorCode.IX_000_007 + assert "404" in (ei.value.detail or "") + + async def test_500_raises_IX_000_007( + self, tmp_path: Path, cfg: FetchConfig, httpx_mock: HTTPXMock + ) -> None: + url = "https://host.local/boom.pdf" + httpx_mock.add_response(url=url, status_code=500, content=b"oops") + file_ref = FileRef(url=url) + with pytest.raises(IXException) as ei: + await fetch_file(file_ref, tmp_dir=tmp_path, cfg=cfg) + assert ei.value.code is IXErrorCode.IX_000_007 + + +class TestTimeout: + async def test_timeout_raises_IX_000_007( + self, tmp_path: Path, cfg: FetchConfig, httpx_mock: HTTPXMock + ) -> None: + url = "https://host.local/slow.pdf" + httpx_mock.add_exception(httpx.ReadTimeout("slow"), url=url) + file_ref = FileRef(url=url) + with pytest.raises(IXException) as ei: + await fetch_file(file_ref, tmp_dir=tmp_path, cfg=cfg) + assert ei.value.code is IXErrorCode.IX_000_007 + + +class TestOversize: + async def test_oversize_raises_IX_000_007( + self, tmp_path: Path, httpx_mock: HTTPXMock + ) -> None: + url = "https://host.local/big.pdf" + cfg = FetchConfig( + connect_timeout_s=1.0, + read_timeout_s=2.0, + max_bytes=100, + ) + # 500 bytes of payload; cap is 100. 
+ httpx_mock.add_response(url=url, status_code=200, content=b"x" * 500) + file_ref = FileRef(url=url) + with pytest.raises(IXException) as ei: + await fetch_file(file_ref, tmp_dir=tmp_path, cfg=cfg) + assert ei.value.code is IXErrorCode.IX_000_007 + + async def test_per_file_max_bytes_override( + self, tmp_path: Path, httpx_mock: HTTPXMock + ) -> None: + url = "https://host.local/mid.pdf" + cfg = FetchConfig( + connect_timeout_s=1.0, + read_timeout_s=2.0, + max_bytes=1_000_000, + ) + # file_ref sets a tighter cap. + httpx_mock.add_response(url=url, status_code=200, content=b"x" * 500) + file_ref = FileRef(url=url, max_bytes=100) + with pytest.raises(IXException) as ei: + await fetch_file(file_ref, tmp_dir=tmp_path, cfg=cfg) + assert ei.value.code is IXErrorCode.IX_000_007 + + +class TestFileUrl: + async def test_file_scheme_reads_local( + self, tmp_path: Path, cfg: FetchConfig + ) -> None: + src = tmp_path / "in.pdf" + src.write_bytes(b"%PDF-1.4\nfile scheme content") + file_ref = FileRef(url=src.as_uri()) + dst = await fetch_file(file_ref, tmp_dir=tmp_path / "out", cfg=cfg) + assert dst.exists() + assert dst.read_bytes() == b"%PDF-1.4\nfile scheme content" + + async def test_file_scheme_missing_raises( + self, tmp_path: Path, cfg: FetchConfig + ) -> None: + missing = tmp_path / "nope.pdf" + file_ref = FileRef(url=missing.as_uri()) + with pytest.raises(IXException) as ei: + await fetch_file(file_ref, tmp_dir=tmp_path, cfg=cfg) + assert ei.value.code is IXErrorCode.IX_000_007 diff --git a/tests/unit/test_ingestion_mime.py b/tests/unit/test_ingestion_mime.py new file mode 100644 index 0000000..0ae2048 --- /dev/null +++ b/tests/unit/test_ingestion_mime.py @@ -0,0 +1,96 @@ +"""Tests for MIME sniffing (spec §6.1).""" + +from __future__ import annotations + +from pathlib import Path + +import pytest + +from ix.errors import IXErrorCode, IXException +from ix.ingestion import SUPPORTED_MIMES, detect_mime, require_supported + +# Real-header fixtures. 
python-magic looks at bytes, not extensions, so +# these are the smallest valid-byte samples we can produce on the fly. + +_PDF_BYTES = b"%PDF-1.4\n%\xe2\xe3\xcf\xd3\n1 0 obj\n<<>>\nendobj\nxref\n0 1\n0000000000 65535 f\ntrailer<<>>\nstartxref\n0\n%%EOF\n" + +_PNG_BYTES = bytes.fromhex( + # PNG magic + minimal IHDR + IDAT + IEND (1x1 all-black). + "89504e470d0a1a0a" + "0000000d49484452" + "00000001000000010806000000" + "1f15c4890000000d" + "49444154789c6300010000000500010d0a2db400000000" + "49454e44ae426082" +) + +_JPEG_BYTES = ( + b"\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00\x00\x01\x00\x01\x00\x00" + b"\xff\xdb\x00C\x00" + b"\x08" * 64 + + b"\xff\xc0\x00\x0b\x08\x00\x01\x00\x01\x01\x01\x11\x00" + + b"\xff\xc4\x00\x1f\x00\x00\x01\x05\x01\x01\x01\x01\x01\x01\x00\x00" + + b"\x00\x00\x00\x00\x00\x00\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b" + + b"\xff\xda\x00\x08\x01\x01\x00\x00?\x00\xfb\xff\xd9" +) + + +def _make_tiff_bytes() -> bytes: + # Tiny valid TIFF via PIL. + from io import BytesIO + + from PIL import Image + + buf = BytesIO() + Image.new("L", (2, 2), color=0).save(buf, format="TIFF") + return buf.getvalue() + + +_TIFF_BYTES = _make_tiff_bytes() + + +@pytest.fixture +def fixtures_dir(tmp_path: Path) -> Path: + d = tmp_path / "fixtures" + d.mkdir() + (d / "sample.pdf").write_bytes(_PDF_BYTES) + (d / "sample.png").write_bytes(_PNG_BYTES) + (d / "sample.jpg").write_bytes(_JPEG_BYTES) + (d / "sample.tif").write_bytes(_TIFF_BYTES) + (d / "sample.txt").write_bytes(b"this is plain text, no magic bytes\n") + return d + + +class TestDetectMime: + def test_pdf(self, fixtures_dir: Path) -> None: + assert detect_mime(fixtures_dir / "sample.pdf") == "application/pdf" + + def test_png(self, fixtures_dir: Path) -> None: + assert detect_mime(fixtures_dir / "sample.png") == "image/png" + + def test_jpeg(self, fixtures_dir: Path) -> None: + assert detect_mime(fixtures_dir / "sample.jpg") == "image/jpeg" + + def test_tiff(self, fixtures_dir: Path) -> None: + assert 
detect_mime(fixtures_dir / "sample.tif") == "image/tiff" + + +class TestSupportedSet: + def test_supported_mimes_contents(self) -> None: + assert { + "application/pdf", + "image/png", + "image/jpeg", + "image/tiff", + } == set(SUPPORTED_MIMES) + + +class TestRequireSupported: + def test_allows_supported(self) -> None: + for m in SUPPORTED_MIMES: + require_supported(m) # no raise + + def test_rejects_unsupported(self) -> None: + with pytest.raises(IXException) as ei: + require_supported("text/plain") + assert ei.value.code is IXErrorCode.IX_000_005 + assert "text/plain" in (ei.value.detail or "") diff --git a/tests/unit/test_ingestion_pages.py b/tests/unit/test_ingestion_pages.py new file mode 100644 index 0000000..a25a5c3 --- /dev/null +++ b/tests/unit/test_ingestion_pages.py @@ -0,0 +1,116 @@ +"""Tests for DocumentIngestor.build_pages (spec §6.1).""" + +from __future__ import annotations + +from io import BytesIO +from pathlib import Path + +import pytest + +from ix.errors import IXErrorCode, IXException +from ix.ingestion import DocumentIngestor + + +def _make_pdf_bytes(n_pages: int) -> bytes: + import fitz + + doc = fitz.open() + for i in range(n_pages): + page = doc.new_page(width=200, height=300) + page.insert_text((10, 20), f"page {i+1}") + out = doc.tobytes() + doc.close() + return out + + +def _make_multi_frame_tiff_bytes(n_frames: int) -> bytes: + from PIL import Image + + frames = [Image.new("L", (10, 10), color=i * 30) for i in range(n_frames)] + buf = BytesIO() + frames[0].save(buf, format="TIFF", save_all=True, append_images=frames[1:]) + return buf.getvalue() + + +class TestPdf: + def test_three_page_pdf_yields_three_pages(self, tmp_path: Path) -> None: + p = tmp_path / "doc.pdf" + p.write_bytes(_make_pdf_bytes(3)) + + ing = DocumentIngestor() + pages, metas = ing.build_pages(files=[(p, "application/pdf")], texts=[]) + + assert len(pages) == 3 + for i, page in enumerate(pages, start=1): + assert page.page_no == i + assert page.width > 0 + assert 
page.height > 0 + assert len(metas) == 3 + for m in metas: + assert m.file_index == 0 + + def test_page_count_cap_raises_IX_000_006(self, tmp_path: Path) -> None: + p = tmp_path / "toomany.pdf" + p.write_bytes(_make_pdf_bytes(101)) + ing = DocumentIngestor() + with pytest.raises(IXException) as ei: + ing.build_pages(files=[(p, "application/pdf")], texts=[]) + assert ei.value.code is IXErrorCode.IX_000_006 + + +class TestImages: + def test_single_frame_png(self, tmp_path: Path) -> None: + from PIL import Image + + p = tmp_path / "img.png" + Image.new("RGB", (50, 80), color="white").save(p, format="PNG") + ing = DocumentIngestor() + pages, metas = ing.build_pages(files=[(p, "image/png")], texts=[]) + assert len(pages) == 1 + assert pages[0].width == 50 + assert pages[0].height == 80 + assert metas[0].file_index == 0 + + def test_multi_frame_tiff_yields_multiple_pages(self, tmp_path: Path) -> None: + p = tmp_path / "multi.tif" + p.write_bytes(_make_multi_frame_tiff_bytes(2)) + + ing = DocumentIngestor() + pages, metas = ing.build_pages(files=[(p, "image/tiff")], texts=[]) + assert len(pages) == 2 + for page in pages: + assert page.width == 10 + assert page.height == 10 + # Both frames share the same file_index. + assert {m.file_index for m in metas} == {0} + + +class TestTexts: + def test_texts_become_pages(self) -> None: + ing = DocumentIngestor() + pages, metas = ing.build_pages(files=[], texts=["hello", "world"]) + assert len(pages) == 2 + assert pages[0].page_no == 1 + assert pages[1].page_no == 2 + # Text-backed pages have no file_index source. 
+ assert metas[0].file_index is None + assert metas[1].file_index is None + + +class TestFileIndexes: + def test_multi_file_indexes_are_contiguous(self, tmp_path: Path) -> None: + p1 = tmp_path / "a.pdf" + p1.write_bytes(_make_pdf_bytes(2)) + p2 = tmp_path / "b.pdf" + p2.write_bytes(_make_pdf_bytes(1)) + + ing = DocumentIngestor() + pages, metas = ing.build_pages( + files=[(p1, "application/pdf"), (p2, "application/pdf")], + texts=[], + ) + assert len(pages) == 3 + # First two pages from file 0, last from file 1. + assert metas[0].file_index == 0 + assert metas[1].file_index == 0 + assert metas[2].file_index == 1