Three layered modules the SetupStep will wire together in Task 2.4.

- fetch.py: async httpx fetch with configurable timeouts + incremental size cap (stream=True, accumulate bytes, raise IX_000_007 when exceeded). file:// URLs read locally. Auth headers pass through. The caller injects a FetchConfig — env reads happen in ix.config (Chunk 3).
- mime.py: python-magic byte-sniff + SUPPORTED_MIMES frozenset + require_supported(mime) helper that raises IX_000_005.
- pages.py: DocumentIngestor.build_pages(files, texts) -> (list[Page], list[PageMetadata]). PDFs via PyMuPDF (hard 100 pg/PDF cap -> IX_000_006), images via Pillow (multi-frame TIFFs yield multiple Pages), texts as zero-dim Pages so GenAIStep can still cite them.

21 new unit tests (141 total) cover: fetch success with headers, 4xx/5xx mapping, timeout -> IX_000_007, size cap enforced globally + per-file, file:// happy path + missing file, MIME detection for PDF/PNG/JPEG/TIFF, require_supported gate, PDF/TIFF/text page counts, 101-page PDF -> IX_000_006, multi-file file_index assignment.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
138 lines
4.8 KiB
Python
138 lines
4.8 KiB
Python
"""Tests for :func:`ix.ingestion.fetch.fetch_file` (spec §6.1)."""
|
|
|
|
from __future__ import annotations
|
|
|
|
from pathlib import Path
|
|
|
|
import httpx
|
|
import pytest
|
|
from pytest_httpx import HTTPXMock
|
|
|
|
from ix.contracts import FileRef
|
|
from ix.errors import IXErrorCode, IXException
|
|
from ix.ingestion import FetchConfig, fetch_file
|
|
|
|
|
|
@pytest.fixture
def cfg() -> FetchConfig:
    """Provide the default :class:`FetchConfig` shared by the tests.

    The configuration uses ``connect_timeout_s=1.0``, ``read_timeout_s=2.0``
    and a ``max_bytes`` cap of ``1024 * 1024``.
    """
    size_cap = 1024 * 1024
    return FetchConfig(
        connect_timeout_s=1.0,
        read_timeout_s=2.0,
        max_bytes=size_cap,
    )
|
|
|
|
|
|
class TestSuccessPath:
    """Successful HTTP download through ``fetch_file``."""

    async def test_downloads_with_auth_header_and_writes_to_tmp(
        self, tmp_path: Path, cfg: FetchConfig, httpx_mock: HTTPXMock
    ) -> None:
        """A 200 response is written to disk and the auth header is sent.

        The mocked endpoint answers a GET with a small PDF-like body. The test
        then checks that the returned path exists, holds exactly that body,
        and that the single recorded request carried the ``Authorization``
        header supplied on the ``FileRef``.
        """
        endpoint = "https://paperless.local/doc/123/download"
        payload = b"%PDF-1.4 body"
        httpx_mock.add_response(
            url=endpoint,
            method="GET",
            status_code=200,
            content=payload,
            headers={"content-type": "application/pdf"},
        )
        auth_headers = {"Authorization": "Token abc"}
        ref = FileRef(url=endpoint, headers=auth_headers)

        saved = await fetch_file(ref, tmp_dir=tmp_path, cfg=cfg)

        assert saved.exists()
        assert saved.read_bytes() == payload

        # Exactly one request must have been issued, carrying the auth header.
        recorded = httpx_mock.get_requests()
        assert len(recorded) == 1
        only_request = recorded[0]
        assert only_request.headers["Authorization"] == "Token abc"
|
|
|
|
|
|
class TestNon2xx:
    """Non-2xx HTTP responses are expected to raise ``IX_000_007``."""

    async def test_404_raises_IX_000_007(
        self, tmp_path: Path, cfg: FetchConfig, httpx_mock: HTTPXMock
    ) -> None:
        """A 404 raises ``IXException`` whose detail mentions the status."""
        missing_url = "https://host.local/missing.pdf"
        httpx_mock.add_response(url=missing_url, status_code=404, content=b"")
        ref = FileRef(url=missing_url)

        with pytest.raises(IXException) as exc_info:
            await fetch_file(ref, tmp_dir=tmp_path, cfg=cfg)

        raised = exc_info.value
        assert raised.code is IXErrorCode.IX_000_007
        # ``detail`` may be None; fall back to an empty string before searching.
        detail_text = raised.detail or ""
        assert "404" in detail_text

    async def test_500_raises_IX_000_007(
        self, tmp_path: Path, cfg: FetchConfig, httpx_mock: HTTPXMock
    ) -> None:
        """A 500 raises ``IXException`` with code ``IX_000_007``."""
        failing_url = "https://host.local/boom.pdf"
        httpx_mock.add_response(url=failing_url, status_code=500, content=b"oops")
        ref = FileRef(url=failing_url)

        with pytest.raises(IXException) as exc_info:
            await fetch_file(ref, tmp_dir=tmp_path, cfg=cfg)

        raised = exc_info.value
        assert raised.code is IXErrorCode.IX_000_007
|
|
|
|
|
|
class TestTimeout:
    """Transport timeouts are expected to raise ``IX_000_007``."""

    async def test_timeout_raises_IX_000_007(
        self, tmp_path: Path, cfg: FetchConfig, httpx_mock: HTTPXMock
    ) -> None:
        """An ``httpx.ReadTimeout`` from the transport becomes ``IXException``."""
        slow_url = "https://host.local/slow.pdf"
        simulated_timeout = httpx.ReadTimeout("slow")
        httpx_mock.add_exception(simulated_timeout, url=slow_url)
        ref = FileRef(url=slow_url)

        with pytest.raises(IXException) as exc_info:
            await fetch_file(ref, tmp_dir=tmp_path, cfg=cfg)

        raised = exc_info.value
        assert raised.code is IXErrorCode.IX_000_007
|
|
|
|
|
|
class TestOversize:
    """Bodies larger than the active size cap are expected to raise ``IX_000_007``."""

    async def test_oversize_raises_IX_000_007(
        self, tmp_path: Path, httpx_mock: HTTPXMock
    ) -> None:
        """A 500-byte body exceeds a config-level cap of 100 bytes."""
        big_url = "https://host.local/big.pdf"
        tight_cfg = FetchConfig(
            connect_timeout_s=1.0,
            read_timeout_s=2.0,
            max_bytes=100,
        )
        # The payload is five times larger than the configured cap.
        oversized_body = b"x" * 500
        httpx_mock.add_response(url=big_url, status_code=200, content=oversized_body)
        ref = FileRef(url=big_url)

        with pytest.raises(IXException) as exc_info:
            await fetch_file(ref, tmp_dir=tmp_path, cfg=tight_cfg)

        raised = exc_info.value
        assert raised.code is IXErrorCode.IX_000_007

    async def test_per_file_max_bytes_override(
        self, tmp_path: Path, httpx_mock: HTTPXMock
    ) -> None:
        """A tighter ``max_bytes`` on the ``FileRef`` wins over a loose config cap."""
        mid_url = "https://host.local/mid.pdf"
        loose_cfg = FetchConfig(
            connect_timeout_s=1.0,
            read_timeout_s=2.0,
            max_bytes=1_000_000,
        )
        # The config alone would allow 500 bytes; the per-file cap of 100 must not.
        body = b"x" * 500
        httpx_mock.add_response(url=mid_url, status_code=200, content=body)
        ref = FileRef(url=mid_url, max_bytes=100)

        with pytest.raises(IXException) as exc_info:
            await fetch_file(ref, tmp_dir=tmp_path, cfg=loose_cfg)

        raised = exc_info.value
        assert raised.code is IXErrorCode.IX_000_007
|
|
|
|
|
|
class TestFileUrl:
    """``file://`` URLs handled by ``fetch_file``."""

    async def test_file_scheme_reads_local(
        self, tmp_path: Path, cfg: FetchConfig
    ) -> None:
        """An existing local file referenced by URI is returned with its bytes.

        The destination directory ``tmp_path / "out"`` is not created by the
        test before the call.
        """
        expected = b"%PDF-1.4\nfile scheme content"
        source_file = tmp_path / "in.pdf"
        source_file.write_bytes(expected)
        ref = FileRef(url=source_file.as_uri())
        out_dir = tmp_path / "out"

        fetched = await fetch_file(ref, tmp_dir=out_dir, cfg=cfg)

        assert fetched.exists()
        assert fetched.read_bytes() == expected

    async def test_file_scheme_missing_raises(
        self, tmp_path: Path, cfg: FetchConfig
    ) -> None:
        """A URI pointing at a non-existent file raises ``IX_000_007``."""
        absent_file = tmp_path / "nope.pdf"
        ref = FileRef(url=absent_file.as_uri())

        with pytest.raises(IXException) as exc_info:
            await fetch_file(ref, tmp_dir=tmp_path, cfg=cfg)

        raised = exc_info.value
        assert raised.code is IXErrorCode.IX_000_007
|