Three layered modules the SetupStep will wire together in Task 2.4. - fetch.py: async httpx fetch with configurable timeouts + incremental size cap (stream=True, accumulate bytes, raise IX_000_007 when exceeded). file:// URLs read locally. Auth headers pass through. The caller injects a FetchConfig — env reads happen in ix.config (Chunk 3). - mime.py: python-magic byte-sniff + SUPPORTED_MIMES frozenset + require_supported(mime) helper that raises IX_000_005. - pages.py: DocumentIngestor.build_pages(files, texts) -> (list[Page], list[PageMetadata]). PDFs via PyMuPDF (hard 100 pg/PDF cap -> IX_000_006), images via Pillow (multi-frame TIFFs yield multiple Pages), texts as zero-dim Pages so GenAIStep can still cite them. 21 new unit tests (141 total) cover: fetch success with headers, 4xx/5xx mapping, timeout -> IX_000_007, size cap enforced globally + per-file, file:// happy path + missing file, MIME detection for PDF/PNG/JPEG/TIFF, require_supported gate, PDF/TIFF/text page counts, 101-page PDF -> IX_000_006, multi-file file_index assignment. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
96 lines
2.9 KiB
Python
96 lines
2.9 KiB
Python
"""Tests for MIME sniffing (spec §6.1)."""
|
|
|
|
from __future__ import annotations
|
|
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
|
|
from ix.errors import IXErrorCode, IXException
|
|
from ix.ingestion import SUPPORTED_MIMES, detect_mime, require_supported
|
|
|
|
# Real-header fixtures. python-magic looks at bytes, not extensions, so
|
|
# these are the smallest valid-byte samples we can produce on the fly.
|
|
|
|
_PDF_BYTES = b"%PDF-1.4\n%\xe2\xe3\xcf\xd3\n1 0 obj\n<<>>\nendobj\nxref\n0 1\n0000000000 65535 f\ntrailer<<>>\nstartxref\n0\n%%EOF\n"
|
|
|
|
_PNG_BYTES = bytes.fromhex(
|
|
# PNG magic + minimal IHDR + IDAT + IEND (1x1 all-black).
|
|
"89504e470d0a1a0a"
|
|
"0000000d49484452"
|
|
"00000001000000010806000000"
|
|
"1f15c4890000000d"
|
|
"49444154789c6300010000000500010d0a2db400000000"
|
|
"49454e44ae426082"
|
|
)
|
|
|
|
_JPEG_BYTES = (
|
|
b"\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00\x00\x01\x00\x01\x00\x00"
|
|
b"\xff\xdb\x00C\x00" + b"\x08" * 64
|
|
+ b"\xff\xc0\x00\x0b\x08\x00\x01\x00\x01\x01\x01\x11\x00"
|
|
+ b"\xff\xc4\x00\x1f\x00\x00\x01\x05\x01\x01\x01\x01\x01\x01\x00\x00"
|
|
+ b"\x00\x00\x00\x00\x00\x00\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b"
|
|
+ b"\xff\xda\x00\x08\x01\x01\x00\x00?\x00\xfb\xff\xd9"
|
|
)
|
|
|
|
|
|
def _make_tiff_bytes() -> bytes:
    """Return the encoded bytes of a tiny TIFF image built in memory.

    A 2x2 image in Pillow mode ``"L"`` filled with value 0 is saved to an
    in-memory buffer in TIFF format, and the buffer's contents are returned.
    Nothing is written to disk.
    """
    # Imports stay local to the helper, so io/PIL are only pulled in when
    # the TIFF sample is actually generated.
    from io import BytesIO

    from PIL import Image

    buf = BytesIO()
    Image.new("L", (2, 2), color=0).save(buf, format="TIFF")
    return buf.getvalue()
|
|
|
|
|
|
# Generated once at import time (module level) rather than per test.
_TIFF_BYTES = _make_tiff_bytes()
|
|
|
|
|
|
@pytest.fixture
def fixtures_dir(tmp_path: Path) -> Path:
    """Create a ``fixtures`` directory under ``tmp_path`` with sample files.

    One file is written per sample payload (PDF, PNG, JPEG, TIFF) plus a
    plain-text file that carries no magic bytes.

    Returns:
        Path of the populated directory.
    """
    target = tmp_path / "fixtures"
    target.mkdir()

    # File name -> raw content. Insertion order matches the order in which
    # the files are created.
    samples = {
        "sample.pdf": _PDF_BYTES,
        "sample.png": _PNG_BYTES,
        "sample.jpg": _JPEG_BYTES,
        "sample.tif": _TIFF_BYTES,
        "sample.txt": b"this is plain text, no magic bytes\n",
    }
    for filename, payload in samples.items():
        (target / filename).write_bytes(payload)

    return target
|
|
|
|
|
|
class TestDetectMime:
    """detect_mime() returns the expected MIME type for each sample file."""

    def test_pdf(self, fixtures_dir: Path) -> None:
        """The PDF sample is reported as application/pdf."""
        sample = fixtures_dir / "sample.pdf"
        assert detect_mime(sample) == "application/pdf"

    def test_png(self, fixtures_dir: Path) -> None:
        """The PNG sample is reported as image/png."""
        sample = fixtures_dir / "sample.png"
        assert detect_mime(sample) == "image/png"

    def test_jpeg(self, fixtures_dir: Path) -> None:
        """The JPEG sample is reported as image/jpeg."""
        sample = fixtures_dir / "sample.jpg"
        assert detect_mime(sample) == "image/jpeg"

    def test_tiff(self, fixtures_dir: Path) -> None:
        """The TIFF sample is reported as image/tiff."""
        sample = fixtures_dir / "sample.tif"
        assert detect_mime(sample) == "image/tiff"
|
|
|
|
|
|
class TestSupportedSet:
    """Pins the exact contents of SUPPORTED_MIMES."""

    def test_supported_mimes_contents(self) -> None:
        """SUPPORTED_MIMES holds exactly the four expected MIME types."""
        expected = {
            "application/pdf",
            "image/png",
            "image/jpeg",
            "image/tiff",
        }
        # Compare as plain sets so the concrete container type of
        # SUPPORTED_MIMES does not matter.
        assert expected == set(SUPPORTED_MIMES)
|
|
|
|
|
|
class TestRequireSupported:
    """require_supported() accepts supported MIME types and rejects others."""

    def test_allows_supported(self) -> None:
        """Every entry of SUPPORTED_MIMES passes without raising."""
        for mime in SUPPORTED_MIMES:
            require_supported(mime)  # no raise

    def test_rejects_unsupported(self) -> None:
        """text/plain raises IXException with code IX_000_005 and is named in detail."""
        with pytest.raises(IXException) as excinfo:
            require_supported("text/plain")

        # Checked after the with-block: statements placed after the raising
        # call inside it would never execute.
        err = excinfo.value
        assert err.code is IXErrorCode.IX_000_005
        assert "text/plain" in (err.detail or "")
|