From 95728accbfe39f5b9f592193be63d36979916d5b Mon Sep 17 00:00:00 2001 From: Dirk Riemann Date: Sat, 18 Apr 2026 11:38:44 +0200 Subject: [PATCH] =?UTF-8?q?feat(config):=20AppConfig=20+=20cached=20get=5F?= =?UTF-8?q?config()=20(spec=20=C2=A79)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Typed pydantic-settings view over every IX_* env var, defaults matching spec §9 exactly. @lru_cache-wrapped accessor so parsing/validation happens once per process; tests clear the cache via get_config.cache_clear(). extra="ignore" keeps the container robust against typo'd env vars in production .env files. engine.py's URL resolver now goes through get_config() when ix.config is importable (bootstrap fallback remains so hypothetical early-import callers don't crash). Co-Authored-By: Claude Opus 4.7 (1M context) --- src/ix/config.py | 75 ++++++++++++++++++++++ tests/unit/test_config.py | 131 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 206 insertions(+) create mode 100644 src/ix/config.py create mode 100644 tests/unit/test_config.py diff --git a/src/ix/config.py b/src/ix/config.py new file mode 100644 index 0000000..8cfe322 --- /dev/null +++ b/src/ix/config.py @@ -0,0 +1,75 @@ +"""Application configuration — loaded from ``IX_*`` env vars via pydantic-settings. + +Spec §9 lists every tunable. This module is the single read-point for them; +callers that need runtime config should go through :func:`get_config` rather +than ``os.environ``. The LRU cache makes the first call materialise + validate +the full config and every subsequent call return the same instance. + +Cache-clearing is public (``get_config.cache_clear()``) because tests need to +re-read after ``monkeypatch.setenv``. Production code never clears the cache. +""" + +from __future__ import annotations + +from functools import lru_cache + +from pydantic_settings import BaseSettings, SettingsConfigDict + + +class AppConfig(BaseSettings): + """Typed view over the ``IX_*`` environment. + + Field names drop the ``IX_`` prefix — pydantic-settings puts it back via + ``env_prefix``. Defaults match the spec exactly; do not change a default + here without updating spec §9 in the same commit. + """ + + model_config = SettingsConfigDict( + env_prefix="IX_", + env_file=".env", + env_file_encoding="utf-8", + extra="ignore", + ) + + # --- Job store --- + postgres_url: str = ( + "postgresql+asyncpg://infoxtractor:" + "@host.docker.internal:5431/infoxtractor" + ) + + # --- LLM backend --- + ollama_url: str = "http://host.docker.internal:11434" + default_model: str = "gpt-oss:20b" + + # --- OCR --- + ocr_engine: str = "surya" + + # --- Pipeline behavior --- + pipeline_worker_concurrency: int = 1 + pipeline_request_timeout_seconds: int = 2700 + genai_call_timeout_seconds: int = 1500 + render_max_pixels_per_page: int = 75_000_000 + + # --- File fetching --- + tmp_dir: str = "/tmp/ix" + file_max_bytes: int = 52_428_800 + file_connect_timeout_seconds: int = 10 + file_read_timeout_seconds: int = 30 + + # --- Transport / callbacks --- + callback_timeout_seconds: int = 10 + + # --- Observability --- + log_level: str = "INFO" + + +@lru_cache(maxsize=1) +def get_config() -> AppConfig: + """Return the process-wide :class:`AppConfig` (materialise on first call). + + Wrapped in ``lru_cache`` so config is parsed + validated once per process. + Tests call ``get_config.cache_clear()`` between scenarios; nothing in + production should touch the cache. + """ + + return AppConfig() diff --git a/tests/unit/test_config.py b/tests/unit/test_config.py new file mode 100644 index 0000000..7a2d5fb --- /dev/null +++ b/tests/unit/test_config.py @@ -0,0 +1,131 @@ +"""Tests for :mod:`ix.config` — the pydantic-settings ``AppConfig``. + +Guardrails we care about: + +1. Every env var in spec §9 round-trips with the right type. +2. Defaults match the spec exactly when no env is set. +3. Unknown IX_ vars are ignored (``extra="ignore"``) so a typo doesn't crash + the container at startup. +4. ``get_config()`` is cached — same instance per process — and + ``get_config.cache_clear()`` rebuilds from the current environment (used by + every test here to keep them independent of process state). +""" + +from __future__ import annotations + +import pytest + +from ix.config import AppConfig, get_config + + +@pytest.fixture(autouse=True) +def _reset_config_cache() -> None: + """Flush the LRU cache around every test. + + Without this, tests that set env vars would see stale data from earlier + runs because ``get_config()`` caches the first materialised instance. + """ + + get_config.cache_clear() + + +def _clear_ix_env(monkeypatch: pytest.MonkeyPatch) -> None: + """Scrub every IX_* var so defaults surface predictably. + + Tests that exercise env-based overrides still call ``monkeypatch.setenv`` + after this to dial in specific values; tests for defaults rely on this + scrubbing so a developer's local ``.env`` can't contaminate the assertion. + """ + + import os + + for key in list(os.environ): + if key.startswith("IX_"): + monkeypatch.delenv(key, raising=False) + + +def test_defaults_match_spec(monkeypatch: pytest.MonkeyPatch) -> None: + _clear_ix_env(monkeypatch) + # Don't let pydantic-settings pick up the repo's .env.example. + cfg = AppConfig(_env_file=None) # type: ignore[call-arg] + + assert cfg.postgres_url == ( + "postgresql+asyncpg://infoxtractor:" + "@host.docker.internal:5431/infoxtractor" + ) + assert cfg.ollama_url == "http://host.docker.internal:11434" + assert cfg.default_model == "gpt-oss:20b" + assert cfg.ocr_engine == "surya" + assert cfg.tmp_dir == "/tmp/ix" + assert cfg.pipeline_worker_concurrency == 1 + assert cfg.pipeline_request_timeout_seconds == 2700 + assert cfg.genai_call_timeout_seconds == 1500 + assert cfg.file_max_bytes == 52428800 + assert cfg.file_connect_timeout_seconds == 10 + assert cfg.file_read_timeout_seconds == 30 + assert cfg.render_max_pixels_per_page == 75000000 + assert cfg.log_level == "INFO" + assert cfg.callback_timeout_seconds == 10 + + +def test_env_overrides(monkeypatch: pytest.MonkeyPatch) -> None: + _clear_ix_env(monkeypatch) + monkeypatch.setenv("IX_POSTGRES_URL", "postgresql+asyncpg://u:p@db:5432/x") + monkeypatch.setenv("IX_OLLAMA_URL", "http://llm:11434") + monkeypatch.setenv("IX_DEFAULT_MODEL", "llama3:8b") + monkeypatch.setenv("IX_PIPELINE_WORKER_CONCURRENCY", "4") + monkeypatch.setenv("IX_GENAI_CALL_TIMEOUT_SECONDS", "60") + monkeypatch.setenv("IX_LOG_LEVEL", "DEBUG") + monkeypatch.setenv("IX_CALLBACK_TIMEOUT_SECONDS", "30") + + cfg = AppConfig(_env_file=None) # type: ignore[call-arg] + + assert cfg.postgres_url == "postgresql+asyncpg://u:p@db:5432/x" + assert cfg.ollama_url == "http://llm:11434" + assert cfg.default_model == "llama3:8b" + assert cfg.pipeline_worker_concurrency == 4 + assert cfg.genai_call_timeout_seconds == 60 + assert cfg.log_level == "DEBUG" + assert cfg.callback_timeout_seconds == 30 + + +def test_get_config_is_cached(monkeypatch: pytest.MonkeyPatch) -> None: + _clear_ix_env(monkeypatch) + monkeypatch.setenv("IX_POSTGRES_URL", "postgresql+asyncpg://a:b@c:5432/d1") + first = get_config() + # Later mutation must NOT be seen until cache_clear — this is a feature, + # not a bug: config is process-level state, not per-call. + monkeypatch.setenv("IX_POSTGRES_URL", "postgresql+asyncpg://a:b@c:5432/d2") + second = get_config() + assert first is second + assert second.postgres_url.endswith("/d1") + + get_config.cache_clear() + third = get_config() + assert third is not first + assert third.postgres_url.endswith("/d2") + + +def test_extra_env_keys_are_ignored(monkeypatch: pytest.MonkeyPatch) -> None: + """A typo'd IX_FOOBAR should not raise ValidationError at startup.""" + _clear_ix_env(monkeypatch) + monkeypatch.setenv("IX_FOOBAR", "whatever") + # Should not raise. + cfg = AppConfig(_env_file=None) # type: ignore[call-arg] + assert cfg.ollama_url.startswith("http://") + + +def test_engine_uses_config_url(monkeypatch: pytest.MonkeyPatch) -> None: + """``ix.store.engine`` reads the URL through ``AppConfig``. + + Task 3.2 refactors engine.py to go through ``get_config()`` instead of + reading ``os.environ`` directly. We can't actually construct an async + engine in a unit test (would need the DB), so we verify the resolution + function exists and returns the configured URL. + """ + _clear_ix_env(monkeypatch) + monkeypatch.setenv("IX_POSTGRES_URL", "postgresql+asyncpg://a:b@c:5432/d") + + from ix.store.engine import _resolve_url + + assert _resolve_url() == "postgresql+asyncpg://a:b@c:5432/d"