"""SetupStep — request validation + file ingestion (spec §6.1). Runs first in the pipeline. Responsibilities: * Reject nonsense requests up front (``IX_000_000`` / ``IX_000_002``). * Normalise ``Context.files`` entries: plain ``str`` → ``FileRef(url=s, headers={})``. * Download every file in parallel (``asyncio.gather``) via the injected ``fetcher`` callable (default: :func:`ix.ingestion.fetch.fetch_file`). * Byte-sniff MIMEs; gate unsupported ones via ``IX_000_005``. * Load the use case pair from :data:`ix.use_cases.REGISTRY` (``IX_001_001`` on miss). * Hand the fetched files + raw texts to the injected ``ingestor`` so ``context.pages`` + ``context.page_metadata`` are ready for the OCRStep. Dependency-inject the fetcher/ingestor/mime-detector so unit tests stay hermetic — production wires the defaults via the app factory. """ from __future__ import annotations import asyncio from pathlib import Path from typing import Any, Protocol from ix.contracts import FileRef, RequestIX, ResponseIX from ix.contracts.response import _InternalContext from ix.errors import IXErrorCode, IXException from ix.ingestion import ( DocumentIngestor, FetchConfig, detect_mime, fetch_file, require_supported, ) from ix.pipeline.step import Step from ix.use_cases import get_use_case class _Fetcher(Protocol): """Async callable that downloads one file to ``tmp_dir``.""" async def __call__( self, file_ref: FileRef, tmp_dir: Path, cfg: FetchConfig ) -> Path: ... class _Ingestor(Protocol): """Turns fetched files + raw texts into a flat Page list.""" def build_pages( self, files: list[tuple[Path, str]], texts: list[str], ) -> tuple[list[Any], list[Any]]: ... class _MimeDetector(Protocol): """Returns the canonical MIME string for a local file.""" def __call__(self, path: Path) -> str: ... 
class SetupStep(Step):
    """First pipeline step: validate + fetch + MIME + use-case load + pages.

    The fetcher, ingestor and MIME detector are injectable so tests can
    replace them; every argument left as ``None`` falls back to the default
    imported from ``ix.ingestion``.
    """

    def __init__(
        self,
        fetcher: _Fetcher | None = None,
        ingestor: _Ingestor | None = None,
        tmp_dir: Path | None = None,
        fetch_config: FetchConfig | None = None,
        mime_detector: _MimeDetector | None = None,
    ) -> None:
        """Store the collaborators, substituting defaults for ``None``.

        Args:
            fetcher: Async download callable; defaults to ``fetch_file``.
            ingestor: Page builder; defaults to a new ``DocumentIngestor``.
            tmp_dir: Root under which per-request scratch directories are
                created; defaults to ``/tmp/ix``.
            fetch_config: Download limits; when ``None`` the fallback in
                :meth:`_resolve_fetch_config` is used.
            mime_detector: MIME sniffer; defaults to ``detect_mime``.
        """
        # Explicit ``is None`` checks (not ``x or default``) so an injected
        # collaborator that happens to be falsy is still honoured. This also
        # matches the check used in ``_resolve_fetch_config``.
        self._fetcher: _Fetcher = fetch_file if fetcher is None else fetcher
        self._ingestor: _Ingestor = (
            DocumentIngestor() if ingestor is None else ingestor
        )
        self._tmp_dir = Path("/tmp/ix") if tmp_dir is None else tmp_dir
        self._fetch_config = fetch_config
        self._mime_detector: _MimeDetector = (
            detect_mime if mime_detector is None else mime_detector
        )

    async def validate(self, request_ix: RequestIX, response_ix: ResponseIX) -> bool:
        """Reject requests that carry nothing to process.

        Returns:
            ``True`` when the request has at least one file or one text.

        Raises:
            IXException: ``IX_000_000`` when ``request_ix`` is ``None``;
                ``IX_000_002`` when both ``context.files`` and
                ``context.texts`` are empty.
        """
        if request_ix is None:  # pragma: no cover - runtime sanity; Pydantic rejects it earlier
            raise IXException(IXErrorCode.IX_000_000)
        ctx = request_ix.context
        if not ctx.files and not ctx.texts:
            raise IXException(IXErrorCode.IX_000_002)
        return True

    async def process(
        self, request_ix: RequestIX, response_ix: ResponseIX
    ) -> ResponseIX:
        """Fetch and ingest the request's inputs and populate ``response_ix``.

        Sets ``response_ix.context`` to a fresh ``_InternalContext`` and
        ``response_ix.use_case_name`` to the use case's display name (or
        ``None`` when the use-case request has no such attribute).

        Returns:
            The same ``response_ix`` object, mutated in place.

        Raises:
            Whatever ``get_use_case``, the fetcher, ``require_supported`` or
            the ingestor raise; nothing is caught or translated here.
        """
        # 1. Load the use-case pair — early so an unknown name fails before
        #    we waste time downloading files.
        use_case_request_cls, use_case_response_cls = get_use_case(request_ix.use_case)
        use_case_request = use_case_request_cls()

        # 2. Resolve the per-request scratch directory. ix_id is assigned
        #    by the transport adapter — fall back to request_id for the MVP
        #    unit tests that don't set ix_id.
        # NOTE(review): ``sub`` is joined onto the scratch root unchecked. If
        # either id can be caller-controlled, a value containing path
        # separators would land outside the root — confirm it is validated
        # upstream.
        sub = request_ix.ix_id or request_ix.request_id
        request_tmp = self._tmp_dir / sub
        request_tmp.mkdir(parents=True, exist_ok=True)

        # 3. Normalise file entries to FileRef. Plain strings get empty headers.
        normalised: list[FileRef] = [
            FileRef(url=entry, headers={}) if isinstance(entry, str) else entry
            for entry in request_ix.context.files
        ]

        # 4. Download in parallel. No retry — IX_000_007 propagates.
        local_paths = await self._fetch_all(
            normalised, request_tmp, self._resolve_fetch_config()
        )

        # 5. MIME-sniff each download; reject unsupported.
        files_with_mime = self._sniff_mimes(local_paths)

        # 6. Build the flat Page list + per-page metadata.
        pages, page_metadata = self._ingestor.build_pages(
            files=files_with_mime,
            texts=list(request_ix.context.texts),
        )

        # 7. Stash everything on the internal context for downstream steps.
        response_ix.context = _InternalContext(
            pages=pages,
            files=files_with_mime,
            texts=list(request_ix.context.texts),
            use_case_request=use_case_request,
            use_case_response=use_case_response_cls,
            segment_index=None,
            page_metadata=page_metadata,
            tmp_dir=request_tmp,
        )

        # 8. Echo use-case display name onto the public response.
        response_ix.use_case_name = getattr(use_case_request, "use_case_name", None)
        return response_ix

    async def _fetch_all(
        self, file_refs: list[FileRef], dest: Path, cfg: FetchConfig
    ) -> list[Path]:
        """Download every entry of ``file_refs`` concurrently into ``dest``.

        Returns the fetcher results in the same order as ``file_refs`` (an
        empty list when there is nothing to fetch).

        A bare ``asyncio.gather`` propagates the first failure but leaves the
        sibling downloads running in the background. Here the siblings are
        cancelled and drained before the original exception is re-raised, so
        no download outlives a failed step.
        """
        if not file_refs:
            return []

        downloads: list[asyncio.Future[Path]] = []
        try:
            # Scheduled inside the ``try`` so that a fetcher call which raises
            # synchronously still cancels the downloads already started.
            for file_ref in file_refs:
                downloads.append(
                    asyncio.ensure_future(self._fetcher(file_ref, dest, cfg))
                )
            return list(await asyncio.gather(*downloads))
        except BaseException:
            for download in downloads:
                download.cancel()
            # Wait for the cancelled downloads to unwind; their outcomes are
            # collected (not raised) so the original exception wins.
            await asyncio.gather(*downloads, return_exceptions=True)
            raise

    def _sniff_mimes(self, local_paths: list[Path]) -> list[tuple[Path, str]]:
        """Pair each path with its detected MIME, gating unsupported types.

        ``require_supported`` is called for every MIME before the pair is
        kept; whatever it raises propagates and aborts the step.
        """
        files_with_mime: list[tuple[Path, str]] = []
        for path in local_paths:
            mime = self._mime_detector(path)
            require_supported(mime)
            files_with_mime.append((path, mime))
        return files_with_mime

    def _resolve_fetch_config(self) -> FetchConfig:
        """Return the injected ``FetchConfig`` or the built-in fallback.

        The fallback uses a 10 s connect timeout, a 30 s read timeout and a
        50 MiB size cap.
        """
        if self._fetch_config is not None:
            return self._fetch_config
        # Fallback defaults — used only when the caller didn't inject one.
        # Real values land via ix.config in Chunk 3.
        return FetchConfig(
            connect_timeout_s=10.0,
            read_timeout_s=30.0,
            max_bytes=50 * 1024 * 1024,
        )


__all__ = ["SetupStep"]