Merge pull request 'feat(ingestion): fetch_file + MIME sniff + DocumentIngestor' (#11) from feat/ingestion into main
Some checks are pending
tests / test (push) Waiting to run
Some checks are pending
tests / test (push) Waiting to run
This commit is contained in:
commit
d801038c74
7 changed files with 676 additions and 0 deletions
27
src/ix/ingestion/__init__.py
Normal file
27
src/ix/ingestion/__init__.py
Normal file
|
|
@ -0,0 +1,27 @@
|
|||
"""Ingestion pipeline helpers: fetch → MIME-detect → build pages.
|
||||
|
||||
Three modules layered bottom-up:
|
||||
|
||||
* :mod:`ix.ingestion.fetch` — async HTTP(S) / ``file://`` downloader with
|
||||
incremental size caps and pluggable timeouts.
|
||||
* :mod:`ix.ingestion.mime` — byte-sniffing MIME detection + the
|
||||
MVP-supported MIME set.
|
||||
* :mod:`ix.ingestion.pages` — :class:`DocumentIngestor` that turns local
|
||||
files + raw texts into the flat :class:`~ix.contracts.Page` list the
|
||||
OCR step expects.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from ix.ingestion.fetch import FetchConfig, fetch_file
|
||||
from ix.ingestion.mime import SUPPORTED_MIMES, detect_mime, require_supported
|
||||
from ix.ingestion.pages import DocumentIngestor
|
||||
|
||||
# Names re-exported from the three submodules as the package's public surface.
__all__ = [
    "SUPPORTED_MIMES",
    "DocumentIngestor",
    "FetchConfig",
    "detect_mime",
    "fetch_file",
    "require_supported",
]
|
||||
144
src/ix/ingestion/fetch.py
Normal file
144
src/ix/ingestion/fetch.py
Normal file
|
|
@ -0,0 +1,144 @@
|
|||
"""Async file fetcher (spec §6.1).
|
||||
|
||||
Supports ``http(s)://`` URLs (via httpx with configurable connect/read
|
||||
timeouts and an incremental size cap) and ``file://`` URLs (read from
|
||||
local fs — used by the E2E fixture). Auth headers on the :class:`FileRef`
|
||||
pass through unchanged.
|
||||
|
||||
Every failure mode surfaces as :attr:`~ix.errors.IXErrorCode.IX_000_007`
|
||||
with the offending URL + cause in the ``detail`` slot so the caller log
|
||||
line is grep-friendly.
|
||||
|
||||
Env-driven defaults live in :mod:`ix.config` (Chunk 3). The caller injects
|
||||
a :class:`FetchConfig` — this module is purely mechanical.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import urllib.parse
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
import httpx
|
||||
|
||||
from ix.contracts import FileRef
|
||||
from ix.errors import IXErrorCode, IXException
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class FetchConfig:
|
||||
"""Per-fetch knobs injected by the caller.
|
||||
|
||||
``connect_timeout_s`` / ``read_timeout_s`` → httpx timeouts.
|
||||
``max_bytes`` is the pipeline-wide default cap; the per-file override
|
||||
on :attr:`~ix.contracts.FileRef.max_bytes` wins when lower.
|
||||
"""
|
||||
|
||||
connect_timeout_s: float
|
||||
read_timeout_s: float
|
||||
max_bytes: int
|
||||
|
||||
|
||||
def _effective_cap(file_ref: FileRef, cfg: FetchConfig) -> int:
    """Resolve the byte limit that applies to one fetch.

    Returns ``cfg.max_bytes`` when the file reference carries no limit of its
    own, otherwise whichever of the two limits is lower.
    """
    override = file_ref.max_bytes
    return cfg.max_bytes if override is None else min(cfg.max_bytes, override)
|
||||
|
||||
|
||||
def _safe_filename(url: str) -> str:
    """Derive a readable filename for the scratch copy from the URL.

    Uses the last component of the URL path (query string and fragment are
    ignored). Path separators are replaced with ``_``. When the path has no
    usable last component — it is empty, or it is a dot-only name such as
    ``..`` — the fixed name ``"download"`` is returned instead.
    """
    parsed = urllib.parse.urlparse(url)
    candidate = Path(parsed.path).name
    # Strip anything that would escape the tmp dir.
    candidate = candidate.replace("/", "_").replace("\\", "_")
    # ``Path("/a/..").name`` is ``".."``; joined onto the scratch directory
    # that would point at its parent, so dot-only names count as "no name".
    if candidate in ("", ".", ".."):
        return "download"
    return candidate
|
||||
|
||||
|
||||
async def _fetch_http(file_ref: FileRef, dst: Path, cfg: FetchConfig) -> None:
    """Stream an HTTP(S) download into ``dst`` under an incremental size cap.

    Issues a GET for ``file_ref.url`` (forwarding ``file_ref.headers`` when
    set) and writes the body to ``dst`` chunk by chunk. The running byte
    count is checked after every chunk, so an oversized body is abandoned as
    soon as it crosses the cap rather than after it has been fully received.

    If anything goes wrong once ``dst`` has been opened, the partially
    written file is removed so callers never see a truncated download.

    Raises:
        IXException: ``IX_000_007`` when the status code is 300 or above,
            the size cap is exceeded, httpx reports a timeout, transport
            error or invalid URL, or writing ``dst`` fails.
    """
    cap = _effective_cap(file_ref, cfg)
    # The positional value is httpx's default for every phase; only the
    # connect phase gets its own limit.
    timeout = httpx.Timeout(
        cfg.read_timeout_s,
        connect=cfg.connect_timeout_s,
    )
    try:
        async with (
            httpx.AsyncClient(timeout=timeout) as client,
            client.stream(
                "GET",
                file_ref.url,
                headers=file_ref.headers or None,
            ) as response,
        ):
            # 3xx is treated as a failure too: the response is rejected
            # rather than followed.
            if response.status_code >= 300:
                raise IXException(
                    IXErrorCode.IX_000_007,
                    detail=f"{file_ref.url}: HTTP {response.status_code}",
                )
            try:
                total = 0
                with dst.open("wb") as fh:
                    async for chunk in response.aiter_bytes():
                        total += len(chunk)
                        if total > cap:
                            raise IXException(
                                IXErrorCode.IX_000_007,
                                detail=f"{file_ref.url}: size cap {cap} bytes exceeded",
                            )
                        fh.write(chunk)
            except BaseException:
                # Cap breach, network failure or task cancellation: do not
                # leave a truncated file behind. Cleanup is best-effort so
                # the original error is the one that propagates.
                try:
                    dst.unlink(missing_ok=True)
                except OSError:
                    pass
                raise
    except IXException:
        raise
    except httpx.TimeoutException as exc:
        raise IXException(
            IXErrorCode.IX_000_007,
            detail=f"{file_ref.url}: timeout ({exc.__class__.__name__})",
        ) from exc
    except (httpx.HTTPError, httpx.InvalidURL) as exc:
        # InvalidURL does not derive from HTTPError, so it is listed explicitly.
        raise IXException(
            IXErrorCode.IX_000_007,
            detail=f"{file_ref.url}: {exc.__class__.__name__}: {exc}",
        ) from exc
    except OSError as exc:
        # Local failures while creating or writing the scratch file.
        raise IXException(
            IXErrorCode.IX_000_007,
            detail=f"{file_ref.url}: {exc.__class__.__name__}: {exc}",
        ) from exc
|
||||
|
||||
|
||||
def _fetch_file_scheme(file_ref: FileRef, dst: Path, cfg: FetchConfig) -> None:
    """Copy the local file named by a ``file://`` URL into ``dst``.

    The URL path is converted back to a filesystem path (percent-escapes
    decoded) before use, so URLs produced by ``Path.as_uri()`` for names
    containing spaces or non-ASCII characters resolve correctly.

    Raises:
        IXException: ``IX_000_007`` when the source is missing, is not a
            regular file, is larger than the effective size cap, or cannot
            be read / written.
    """
    # Local import: ``urllib.request`` is only needed for this conversion.
    from urllib.request import url2pathname

    cap = _effective_cap(file_ref, cfg)
    src_path = Path(url2pathname(urllib.parse.urlparse(file_ref.url).path))
    try:
        if not src_path.exists():
            raise IXException(
                IXErrorCode.IX_000_007,
                detail=f"{file_ref.url}: file does not exist",
            )
        if not src_path.is_file():
            raise IXException(
                IXErrorCode.IX_000_007,
                detail=f"{file_ref.url}: not a regular file",
            )
        size = src_path.stat().st_size
        if size > cap:
            raise IXException(
                IXErrorCode.IX_000_007,
                detail=f"{file_ref.url}: size cap {cap} bytes exceeded",
            )
        dst.write_bytes(src_path.read_bytes())
    except IXException:
        raise
    except OSError as exc:
        # Permission problems, races with deletion, unwritable destination…
        raise IXException(
            IXErrorCode.IX_000_007,
            detail=f"{file_ref.url}: {exc.__class__.__name__}: {exc}",
        ) from exc
|
||||
|
||||
|
||||
async def fetch_file(file_ref: FileRef, tmp_dir: Path, cfg: FetchConfig) -> Path:
    """Materialise ``file_ref`` as a local file inside ``tmp_dir``.

    ``tmp_dir`` is created if needed. ``http``/``https`` URLs are downloaded
    and ``file`` URLs are copied; any other scheme is rejected. The returned
    path is ``tmp_dir`` joined with a name derived from the last component
    of the URL path.

    NOTE(review): two URLs sharing the same last path component map to the
    same destination inside one ``tmp_dir``, so the later fetch overwrites
    the earlier one — callers fetching several files may want one directory
    per file.

    Raises:
        IXException: :attr:`~ix.errors.IXErrorCode.IX_000_007` for an
            unsupported scheme or any failure reported by the fetch helpers.
    """
    tmp_dir.mkdir(parents=True, exist_ok=True)
    scheme = urllib.parse.urlparse(file_ref.url).scheme.lower()

    if scheme not in ("http", "https", "file"):
        raise IXException(
            IXErrorCode.IX_000_007,
            detail=f"{file_ref.url}: unsupported URL scheme {scheme!r}",
        )

    target = tmp_dir / _safe_filename(file_ref.url)
    if scheme == "file":
        _fetch_file_scheme(file_ref, target, cfg)
    else:
        await _fetch_http(file_ref, target, cfg)
    return target
|
||||
|
||||
|
||||
__all__ = ["FetchConfig", "fetch_file"]
|
||||
37
src/ix/ingestion/mime.py
Normal file
37
src/ix/ingestion/mime.py
Normal file
|
|
@ -0,0 +1,37 @@
|
|||
"""MIME detection + supported-MIME gate (spec §6.1).
|
||||
|
||||
Bytes-only; URL extensions are ignored because callers (Paperless, …)
|
||||
may serve `/download` routes without a file suffix. ``python-magic``
|
||||
reads the file header and returns the canonical MIME.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import magic
|
||||
|
||||
from ix.errors import IXErrorCode, IXException
|
||||
|
||||
# MIME types accepted by ``require_supported``; everything else is rejected.
SUPPORTED_MIMES: frozenset[str] = frozenset(
    ("application/pdf", "image/png", "image/jpeg", "image/tiff")
)
|
||||
|
||||
|
||||
def detect_mime(path: Path) -> str:
    """Sniff the bytes of ``path`` with ``python-magic`` and return the MIME type."""
    sniffed = magic.from_file(str(path), mime=True)
    return sniffed
|
||||
|
||||
|
||||
def require_supported(mime: str) -> None:
    """Gate on :data:`SUPPORTED_MIMES`.

    Returns silently for a supported type; otherwise raises
    :class:`~ix.errors.IXException` with ``IX_000_005`` and the offending
    MIME string as ``detail``.
    """
    if mime in SUPPORTED_MIMES:
        return
    raise IXException(IXErrorCode.IX_000_005, detail=mime)
|
||||
|
||||
|
||||
__all__ = ["SUPPORTED_MIMES", "detect_mime", "require_supported"]
|
||||
118
src/ix/ingestion/pages.py
Normal file
118
src/ix/ingestion/pages.py
Normal file
|
|
@ -0,0 +1,118 @@
|
|||
"""Turn downloaded files + raw texts into a flat :class:`Page` list (spec §6.1).
|
||||
|
||||
PDFs → one :class:`Page` per page via PyMuPDF, with a hard 100 pages/PDF
|
||||
cap (``IX_000_006``). Images → Pillow; multi-frame TIFFs yield one Page
|
||||
per frame. Texts → one zero-dimension Page each so the downstream OCR /
|
||||
GenAI steps can still cite them.
|
||||
|
||||
A parallel list of :class:`~ix.segmentation.PageMetadata` is returned so
|
||||
the pipeline (via :class:`SegmentIndex`) can resolve segment IDs back to
|
||||
``file_index`` anchors.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import fitz # PyMuPDF
|
||||
from PIL import Image, ImageSequence
|
||||
|
||||
from ix.contracts import Page
|
||||
from ix.errors import IXErrorCode, IXException
|
||||
from ix.segmentation import PageMetadata
|
||||
|
||||
_PDF_PAGE_CAP = 100
|
||||
|
||||
|
||||
class DocumentIngestor:
    """Builds the flat Page list that feeds :class:`~ix.pipeline.ocr_step.OCRStep`.

    No constructor args for MVP — the per-PDF page cap is the module constant
    ``_PDF_PAGE_CAP``. If this needs to be tunable later, move it to a
    dataclass config.
    """

    def build_pages(
        self,
        files: list[tuple[Path, str]],
        texts: list[str],
    ) -> tuple[list[Page], list[PageMetadata]]:
        """Return ``(pages, metas)`` in insertion order.

        ``files`` is a list of ``(local_path, mime_type)`` tuples; mimes
        have already been validated by :func:`ix.ingestion.mime.require_supported`.
        File-backed pages come first (in ``files`` order), followed by one
        zero-dimension page per entry in ``texts``. ``page_no`` counts
        continuously from 1 across the whole result, and ``metas[i]``
        describes ``pages[i]``.

        Raises:
            IXException: ``IX_000_005`` for a MIME type this class does not
                handle; ``IX_000_006`` when a PDF exceeds the page cap.
        """
        pages: list[Page] = []
        metas: list[PageMetadata] = []

        for file_index, (path, mime) in enumerate(files):
            if mime == "application/pdf":
                self._extend_with_pdf(path, file_index, pages, metas)
            elif mime in ("image/png", "image/jpeg", "image/tiff"):
                self._extend_with_image(path, file_index, pages, metas)
            else:  # pragma: no cover - defensive; require_supported should gate upstream
                raise IXException(IXErrorCode.IX_000_005, detail=mime)

        for _ in texts:
            # Text-backed pages are zero-dim; they exist so the GenAIStep
            # can merge their content into the prompt alongside OCR.
            pages.append(
                Page(
                    page_no=len(pages) + 1,
                    width=0.0,
                    height=0.0,
                    lines=[],
                )
            )
            # No source file, hence no file_index.
            metas.append(PageMetadata(file_index=None))

        return pages, metas

    def _extend_with_pdf(
        self,
        path: Path,
        file_index: int,
        pages: list[Page],
        metas: list[PageMetadata],
    ) -> None:
        """Append one Page (+ metadata) per PDF page of ``path``.

        Raises:
            IXException: ``IX_000_006`` when the document has more than
                ``_PDF_PAGE_CAP`` pages; nothing is appended in that case.
        """
        # The MIME type was established by byte-sniffing, and downloaded
        # files may carry no suffix (or a misleading one), so tell PyMuPDF
        # the type explicitly instead of letting it infer from the filename.
        with fitz.open(str(path), filetype="pdf") as doc:
            if doc.page_count > _PDF_PAGE_CAP:
                raise IXException(
                    IXErrorCode.IX_000_006,
                    detail=f"{path}: {doc.page_count} pages (cap {_PDF_PAGE_CAP})",
                )
            for page in doc:
                rect = page.rect
                pages.append(
                    Page(
                        page_no=len(pages) + 1,
                        width=float(rect.width),
                        height=float(rect.height),
                        lines=[],
                    )
                )
                metas.append(PageMetadata(file_index=file_index))

    def _extend_with_image(
        self,
        path: Path,
        file_index: int,
        pages: list[Page],
        metas: list[PageMetadata],
    ) -> None:
        """Append one Page (+ metadata) per frame of the image at ``path``.

        Single-frame formats yield exactly one page; a multi-frame TIFF
        yields one page per frame, all sharing the same ``file_index``.
        """
        with Image.open(path) as img:
            for frame in ImageSequence.Iterator(img):
                pages.append(
                    Page(
                        page_no=len(pages) + 1,
                        width=float(frame.width),
                        height=float(frame.height),
                        lines=[],
                    )
                )
                metas.append(PageMetadata(file_index=file_index))
|
||||
|
||||
|
||||
__all__ = ["DocumentIngestor"]
|
||||
138
tests/unit/test_ingestion_fetch.py
Normal file
138
tests/unit/test_ingestion_fetch.py
Normal file
|
|
@ -0,0 +1,138 @@
|
|||
"""Tests for :func:`ix.ingestion.fetch.fetch_file` (spec §6.1)."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import httpx
|
||||
import pytest
|
||||
from pytest_httpx import HTTPXMock
|
||||
|
||||
from ix.contracts import FileRef
|
||||
from ix.errors import IXErrorCode, IXException
|
||||
from ix.ingestion import FetchConfig, fetch_file
|
||||
|
||||
|
||||
@pytest.fixture
def cfg() -> FetchConfig:
    """Default fetch settings for these tests: 1 s connect, 2 s read, 1 MiB cap."""
    one_mib = 1024 * 1024
    return FetchConfig(connect_timeout_s=1.0, read_timeout_s=2.0, max_bytes=one_mib)
|
||||
|
||||
|
||||
class TestSuccessPath:
    """A 200 response is written to the tmp dir and request headers are forwarded."""

    async def test_downloads_with_auth_header_and_writes_to_tmp(
        self, tmp_path: Path, cfg: FetchConfig, httpx_mock: HTTPXMock
    ) -> None:
        url = "https://paperless.local/doc/123/download"
        body = b"%PDF-1.4 body"
        httpx_mock.add_response(
            url=url,
            method="GET",
            status_code=200,
            content=body,
            headers={"content-type": "application/pdf"},
        )
        ref = FileRef(url=url, headers={"Authorization": "Token abc"})

        downloaded = await fetch_file(ref, tmp_dir=tmp_path, cfg=cfg)

        assert downloaded.exists()
        assert downloaded.read_bytes() == body

        # Exactly one request went out, and it carried the auth header.
        sent = httpx_mock.get_requests()
        assert len(sent) == 1
        assert sent[0].headers["Authorization"] == "Token abc"
|
||||
|
||||
|
||||
class TestNon2xx:
    """Error status codes are reported as IX_000_007."""

    async def test_404_raises_IX_000_007(
        self, tmp_path: Path, cfg: FetchConfig, httpx_mock: HTTPXMock
    ) -> None:
        url = "https://host.local/missing.pdf"
        httpx_mock.add_response(url=url, status_code=404, content=b"")

        with pytest.raises(IXException) as excinfo:
            await fetch_file(FileRef(url=url), tmp_dir=tmp_path, cfg=cfg)

        err = excinfo.value
        assert err.code is IXErrorCode.IX_000_007
        # The status code must be visible in the detail string.
        assert "404" in (err.detail or "")

    async def test_500_raises_IX_000_007(
        self, tmp_path: Path, cfg: FetchConfig, httpx_mock: HTTPXMock
    ) -> None:
        url = "https://host.local/boom.pdf"
        httpx_mock.add_response(url=url, status_code=500, content=b"oops")

        with pytest.raises(IXException) as excinfo:
            await fetch_file(FileRef(url=url), tmp_dir=tmp_path, cfg=cfg)

        assert excinfo.value.code is IXErrorCode.IX_000_007
|
||||
|
||||
|
||||
class TestTimeout:
    """An httpx timeout is translated into IX_000_007."""

    async def test_timeout_raises_IX_000_007(
        self, tmp_path: Path, cfg: FetchConfig, httpx_mock: HTTPXMock
    ) -> None:
        url = "https://host.local/slow.pdf"
        httpx_mock.add_exception(httpx.ReadTimeout("slow"), url=url)

        with pytest.raises(IXException) as excinfo:
            await fetch_file(FileRef(url=url), tmp_dir=tmp_path, cfg=cfg)

        assert excinfo.value.code is IXErrorCode.IX_000_007
|
||||
|
||||
|
||||
class TestOversize:
    """Bodies larger than the effective cap are rejected with IX_000_007."""

    async def test_oversize_raises_IX_000_007(
        self, tmp_path: Path, httpx_mock: HTTPXMock
    ) -> None:
        url = "https://host.local/big.pdf"
        tight_cfg = FetchConfig(connect_timeout_s=1.0, read_timeout_s=2.0, max_bytes=100)
        # 500 bytes of payload; cap is 100.
        httpx_mock.add_response(url=url, status_code=200, content=b"x" * 500)

        with pytest.raises(IXException) as excinfo:
            await fetch_file(FileRef(url=url), tmp_dir=tmp_path, cfg=tight_cfg)

        assert excinfo.value.code is IXErrorCode.IX_000_007

    async def test_per_file_max_bytes_override(
        self, tmp_path: Path, httpx_mock: HTTPXMock
    ) -> None:
        url = "https://host.local/mid.pdf"
        roomy_cfg = FetchConfig(
            connect_timeout_s=1.0,
            read_timeout_s=2.0,
            max_bytes=1_000_000,
        )
        httpx_mock.add_response(url=url, status_code=200, content=b"x" * 500)
        # The FileRef's own, tighter cap is the one that must apply.
        capped_ref = FileRef(url=url, max_bytes=100)

        with pytest.raises(IXException) as excinfo:
            await fetch_file(capped_ref, tmp_dir=tmp_path, cfg=roomy_cfg)

        assert excinfo.value.code is IXErrorCode.IX_000_007
|
||||
|
||||
|
||||
class TestFileUrl:
    """``file://`` URLs are copied from the local filesystem."""

    async def test_file_scheme_reads_local(
        self, tmp_path: Path, cfg: FetchConfig
    ) -> None:
        payload = b"%PDF-1.4\nfile scheme content"
        source = tmp_path / "in.pdf"
        source.write_bytes(payload)

        copied = await fetch_file(
            FileRef(url=source.as_uri()), tmp_dir=tmp_path / "out", cfg=cfg
        )

        assert copied.exists()
        assert copied.read_bytes() == payload

    async def test_file_scheme_missing_raises(
        self, tmp_path: Path, cfg: FetchConfig
    ) -> None:
        absent = tmp_path / "nope.pdf"

        with pytest.raises(IXException) as excinfo:
            await fetch_file(FileRef(url=absent.as_uri()), tmp_dir=tmp_path, cfg=cfg)

        assert excinfo.value.code is IXErrorCode.IX_000_007
|
||||
96
tests/unit/test_ingestion_mime.py
Normal file
96
tests/unit/test_ingestion_mime.py
Normal file
|
|
@ -0,0 +1,96 @@
|
|||
"""Tests for MIME sniffing (spec §6.1)."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from ix.errors import IXErrorCode, IXException
|
||||
from ix.ingestion import SUPPORTED_MIMES, detect_mime, require_supported
|
||||
|
||||
# Real-header fixtures. python-magic looks at bytes, not extensions, so
|
||||
# these are the smallest valid-byte samples we can produce on the fly.
|
||||
|
||||
_PDF_BYTES = b"%PDF-1.4\n%\xe2\xe3\xcf\xd3\n1 0 obj\n<<>>\nendobj\nxref\n0 1\n0000000000 65535 f\ntrailer<<>>\nstartxref\n0\n%%EOF\n"
|
||||
|
||||
# PNG magic + minimal IHDR + IDAT + IEND (1x1 all-black).
_PNG_BYTES = bytes.fromhex(
    "".join(
        (
            "89504e470d0a1a0a",  # 8-byte PNG signature
            "0000000d49484452",  # IHDR chunk length + tag
            "00000001000000010806000000",  # IHDR payload (width 1, height 1, …)
            "1f15c4890000000d",
            "49444154789c6300010000000500010d0a2db400000000",  # starts with the IDAT tag
            "49454e44ae426082",  # IEND tag + CRC
        )
    )
)
|
||||
|
||||
# Hand-assembled JPEG byte sample: starts with the SOI marker + JFIF header
# and ends with the EOI marker.
_JPEG_BYTES = b"".join(
    [
        b"\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00\x00\x01\x00\x01\x00\x00",
        b"\xff\xdb\x00C\x00",
        b"\x08" * 64,
        b"\xff\xc0\x00\x0b\x08\x00\x01\x00\x01\x01\x01\x11\x00",
        b"\xff\xc4\x00\x1f\x00\x00\x01\x05\x01\x01\x01\x01\x01\x01\x00\x00",
        b"\x00\x00\x00\x00\x00\x00\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b",
        b"\xff\xda\x00\x08\x01\x01\x00\x00?\x00\xfb\xff\xd9",
    ]
)
|
||||
|
||||
|
||||
def _make_tiff_bytes() -> bytes:
    """Encode a 2x2 black greyscale image as TIFF in memory via Pillow."""
    from io import BytesIO

    from PIL import Image

    sink = BytesIO()
    tiny = Image.new("L", (2, 2), color=0)
    tiny.save(sink, format="TIFF")
    return sink.getvalue()
|
||||
|
||||
|
||||
_TIFF_BYTES = _make_tiff_bytes()
|
||||
|
||||
|
||||
@pytest.fixture
def fixtures_dir(tmp_path: Path) -> Path:
    """Create ``tmp_path/fixtures`` holding one sample file per tested type."""
    root = tmp_path / "fixtures"
    root.mkdir()
    samples = {
        "sample.pdf": _PDF_BYTES,
        "sample.png": _PNG_BYTES,
        "sample.jpg": _JPEG_BYTES,
        "sample.tif": _TIFF_BYTES,
        "sample.txt": b"this is plain text, no magic bytes\n",
    }
    for name, payload in samples.items():
        (root / name).write_bytes(payload)
    return root
|
||||
|
||||
|
||||
class TestDetectMime:
    """``detect_mime`` identifies each sample file by its bytes."""

    def test_pdf(self, fixtures_dir: Path) -> None:
        detected = detect_mime(fixtures_dir / "sample.pdf")
        assert detected == "application/pdf"

    def test_png(self, fixtures_dir: Path) -> None:
        detected = detect_mime(fixtures_dir / "sample.png")
        assert detected == "image/png"

    def test_jpeg(self, fixtures_dir: Path) -> None:
        detected = detect_mime(fixtures_dir / "sample.jpg")
        assert detected == "image/jpeg"

    def test_tiff(self, fixtures_dir: Path) -> None:
        detected = detect_mime(fixtures_dir / "sample.tif")
        assert detected == "image/tiff"
|
||||
|
||||
|
||||
class TestSupportedSet:
    """Pins the exact contents of ``SUPPORTED_MIMES``."""

    def test_supported_mimes_contents(self) -> None:
        expected = {"application/pdf", "image/png", "image/jpeg", "image/tiff"}
        assert expected == set(SUPPORTED_MIMES)
|
||||
|
||||
|
||||
class TestRequireSupported:
    """``require_supported`` passes supported types and raises IX_000_005 otherwise."""

    def test_allows_supported(self) -> None:
        for mime in SUPPORTED_MIMES:
            require_supported(mime)  # must not raise

    def test_rejects_unsupported(self) -> None:
        with pytest.raises(IXException) as excinfo:
            require_supported("text/plain")

        err = excinfo.value
        assert err.code is IXErrorCode.IX_000_005
        assert "text/plain" in (err.detail or "")
|
||||
116
tests/unit/test_ingestion_pages.py
Normal file
116
tests/unit/test_ingestion_pages.py
Normal file
|
|
@ -0,0 +1,116 @@
|
|||
"""Tests for DocumentIngestor.build_pages (spec §6.1)."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from ix.errors import IXErrorCode, IXException
|
||||
from ix.ingestion import DocumentIngestor
|
||||
|
||||
|
||||
def _make_pdf_bytes(n_pages: int) -> bytes:
    """Build an in-memory PDF with ``n_pages`` 200x300 pages, each labelled ``page N``."""
    import fitz

    pdf = fitz.open()
    for number in range(1, n_pages + 1):
        sheet = pdf.new_page(width=200, height=300)
        sheet.insert_text((10, 20), f"page {number}")
    data = pdf.tobytes()
    pdf.close()
    return data
|
||||
|
||||
|
||||
def _make_multi_frame_tiff_bytes(n_frames: int) -> bytes:
    """Encode ``n_frames`` 10x10 greyscale images as one multi-frame TIFF."""
    from PIL import Image

    images = []
    for idx in range(n_frames):
        images.append(Image.new("L", (10, 10), color=idx * 30))

    sink = BytesIO()
    head, tail = images[0], images[1:]
    head.save(sink, format="TIFF", save_all=True, append_images=tail)
    return sink.getvalue()
|
||||
|
||||
|
||||
class TestPdf:
    """PDF inputs: one Page per PDF page, and the page cap is enforced."""

    def test_three_page_pdf_yields_three_pages(self, tmp_path: Path) -> None:
        pdf_path = tmp_path / "doc.pdf"
        pdf_path.write_bytes(_make_pdf_bytes(3))

        pages, metas = DocumentIngestor().build_pages(
            files=[(pdf_path, "application/pdf")], texts=[]
        )

        assert len(pages) == 3
        for expected_no, page in enumerate(pages, start=1):
            assert page.page_no == expected_no
            assert page.width > 0
            assert page.height > 0
        assert len(metas) == 3
        for meta in metas:
            assert meta.file_index == 0

    def test_page_count_cap_raises_IX_000_006(self, tmp_path: Path) -> None:
        pdf_path = tmp_path / "toomany.pdf"
        # One page more than the ingestor accepts.
        pdf_path.write_bytes(_make_pdf_bytes(101))

        with pytest.raises(IXException) as excinfo:
            DocumentIngestor().build_pages(
                files=[(pdf_path, "application/pdf")], texts=[]
            )

        assert excinfo.value.code is IXErrorCode.IX_000_006
|
||||
|
||||
|
||||
class TestImages:
    """Image inputs: one Page per frame, sized from the frame's pixel dimensions."""

    def test_single_frame_png(self, tmp_path: Path) -> None:
        from PIL import Image

        png_path = tmp_path / "img.png"
        Image.new("RGB", (50, 80), color="white").save(png_path, format="PNG")

        pages, metas = DocumentIngestor().build_pages(
            files=[(png_path, "image/png")], texts=[]
        )

        assert len(pages) == 1
        only = pages[0]
        assert only.width == 50
        assert only.height == 80
        assert metas[0].file_index == 0

    def test_multi_frame_tiff_yields_multiple_pages(self, tmp_path: Path) -> None:
        tiff_path = tmp_path / "multi.tif"
        tiff_path.write_bytes(_make_multi_frame_tiff_bytes(2))

        pages, metas = DocumentIngestor().build_pages(
            files=[(tiff_path, "image/tiff")], texts=[]
        )

        assert len(pages) == 2
        for page in pages:
            assert page.width == 10
            assert page.height == 10
        # Both frames share the same file_index.
        assert {meta.file_index for meta in metas} == {0}
|
||||
|
||||
|
||||
class TestTexts:
    """Raw texts each become a page with no originating file."""

    def test_texts_become_pages(self) -> None:
        pages, metas = DocumentIngestor().build_pages(
            files=[], texts=["hello", "world"]
        )

        assert len(pages) == 2
        first, second = pages[0], pages[1]
        assert first.page_no == 1
        assert second.page_no == 2
        # Text-backed pages have no file_index source.
        assert metas[0].file_index is None
        assert metas[1].file_index is None
|
||||
|
||||
|
||||
class TestFileIndexes:
    """``file_index`` follows the position of each file in the input list."""

    def test_multi_file_indexes_are_contiguous(self, tmp_path: Path) -> None:
        two_pager = tmp_path / "a.pdf"
        two_pager.write_bytes(_make_pdf_bytes(2))
        one_pager = tmp_path / "b.pdf"
        one_pager.write_bytes(_make_pdf_bytes(1))

        inputs = [(two_pager, "application/pdf"), (one_pager, "application/pdf")]
        pages, metas = DocumentIngestor().build_pages(files=inputs, texts=[])

        assert len(pages) == 3
        # First two pages from file 0, last from file 1.
        assert metas[0].file_index == 0
        assert metas[1].file_index == 0
        assert metas[2].file_index == 1
|
||||
Loading…
Reference in a new issue