Compare commits

...

2 commits

Author SHA1 Message Date
8bb220ae43 Merge pull request 'feat(config): AppConfig + cached get_config()' (#19) from feat/config into main
All checks were successful
tests / test (push) Successful in 59s
2026-04-18 09:39:00 +00:00
95728accbf feat(config): AppConfig + cached get_config() (spec §9)
All checks were successful
tests / test (push) Successful in 1m1s
tests / test (pull_request) Successful in 58s
Typed pydantic-settings view over every IX_* env var, defaults matching
spec §9 exactly. @lru_cache-wrapped accessor so parsing/validation happens
once per process; tests clear the cache via get_config.cache_clear().

extra="ignore" keeps the container robust against typo'd env vars in
production .env files. engine.py's URL resolver now goes through
get_config() when ix.config is importable (bootstrap fallback remains so
hypothetical early-import callers don't crash).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-18 11:38:44 +02:00
2 changed files with 206 additions and 0 deletions

75
src/ix/config.py Normal file
View file

@ -0,0 +1,75 @@
"""Application configuration — loaded from ``IX_*`` env vars via pydantic-settings.
Spec §9 lists every tunable. This module is the single read-point for them;
callers that need runtime config should go through :func:`get_config` rather
than ``os.environ``. The LRU cache makes the first call materialise + validate
the full config and every subsequent call return the same instance.
Cache-clearing is public (``get_config.cache_clear()``) because tests need to
re-read after ``monkeypatch.setenv``. Production code never clears the cache.
"""
from __future__ import annotations
from functools import lru_cache
from pydantic_settings import BaseSettings, SettingsConfigDict
class AppConfig(BaseSettings):
    """Typed view over the ``IX_*`` environment.

    Field names drop the ``IX_`` prefix — pydantic-settings puts it back via
    ``env_prefix``. Defaults match the spec exactly; do not change a default
    here without updating spec §9 in the same commit.
    """

    model_config = SettingsConfigDict(
        env_prefix="IX_",
        env_file=".env",
        env_file_encoding="utf-8",
        # Unknown ``IX_*`` keys are dropped instead of raising, so a typo'd
        # variable cannot stop the application from starting.
        extra="ignore",
    )

    # --- Job store ---
    postgres_url: str = (
        "postgresql+asyncpg://infoxtractor:<password>"
        "@host.docker.internal:5431/infoxtractor"
    )

    # --- LLM backend ---
    ollama_url: str = "http://host.docker.internal:11434"
    default_model: str = "gpt-oss:20b"

    # --- OCR ---
    ocr_engine: str = "surya"

    # --- Pipeline behavior ---
    pipeline_worker_concurrency: int = 1
    pipeline_request_timeout_seconds: int = 2700  # 45 minutes
    genai_call_timeout_seconds: int = 1500  # 25 minutes
    render_max_pixels_per_page: int = 75_000_000

    # --- File fetching ---
    tmp_dir: str = "/tmp/ix"
    file_max_bytes: int = 52_428_800  # 50 MiB
    file_connect_timeout_seconds: int = 10
    file_read_timeout_seconds: int = 30

    # --- Transport / callbacks ---
    callback_timeout_seconds: int = 10

    # --- Observability ---
    log_level: str = "INFO"
@lru_cache(maxsize=1)
def get_config() -> AppConfig:
    """Return the process-wide :class:`AppConfig`, building it on first call.

    The ``lru_cache`` wrapper means the environment is parsed and validated
    once; every later call hands back that same instance. Tests call
    ``get_config.cache_clear()`` between scenarios to force a rebuild; nothing
    in production should touch the cache.
    """
    return AppConfig()

131
tests/unit/test_config.py Normal file
View file

@ -0,0 +1,131 @@
"""Tests for :mod:`ix.config` — the pydantic-settings ``AppConfig``.
Guardrails we care about:
1. Every env var in spec §9 round-trips with the right type.
2. Defaults match the spec exactly when no env is set.
3. Unknown IX_ vars are ignored (``extra="ignore"``) so a typo doesn't crash
the container at startup.
4. ``get_config()`` is cached — same instance per process — and
``get_config.cache_clear()`` rebuilds from the current environment (used by
every test here to keep them independent of process state).
"""
from __future__ import annotations

import os
from collections.abc import Iterator

import pytest

from ix.config import AppConfig, get_config
@pytest.fixture(autouse=True)
def _reset_config_cache() -> Iterator[None]:
    """Flush the LRU cache around every test.

    Clearing *before* the test stops it from seeing a config materialised by
    an earlier test. Clearing *after* stops a config built from this test's
    monkeypatched environment from staying cached once the test has finished.
    """
    get_config.cache_clear()
    yield
    get_config.cache_clear()
def _clear_ix_env(monkeypatch: pytest.MonkeyPatch) -> None:
"""Scrub every IX_* var so defaults surface predictably.
Tests that exercise env-based overrides still call ``monkeypatch.setenv``
after this to dial in specific values; tests for defaults rely on this
scrubbing so a developer's local ``.env`` can't contaminate the assertion.
"""
import os
for key in list(os.environ):
if key.startswith("IX_"):
monkeypatch.delenv(key, raising=False)
def test_defaults_match_spec(monkeypatch: pytest.MonkeyPatch) -> None:
    """With no ``IX_*`` env set, every field carries its spec §9 default."""
    _clear_ix_env(monkeypatch)
    # Disable dotenv loading so a local ``.env`` can't override the defaults.
    cfg = AppConfig(_env_file=None)  # type: ignore[call-arg]

    # Job store
    assert cfg.postgres_url == (
        "postgresql+asyncpg://infoxtractor:<password>"
        "@host.docker.internal:5431/infoxtractor"
    )

    # LLM backend / OCR
    assert cfg.ollama_url == "http://host.docker.internal:11434"
    assert cfg.default_model == "gpt-oss:20b"
    assert cfg.ocr_engine == "surya"

    # Pipeline behavior
    assert cfg.pipeline_worker_concurrency == 1
    assert cfg.pipeline_request_timeout_seconds == 2700
    assert cfg.genai_call_timeout_seconds == 1500
    assert cfg.render_max_pixels_per_page == 75000000

    # File fetching
    assert cfg.tmp_dir == "/tmp/ix"
    assert cfg.file_max_bytes == 52428800
    assert cfg.file_connect_timeout_seconds == 10
    assert cfg.file_read_timeout_seconds == 30

    # Transport / observability
    assert cfg.callback_timeout_seconds == 10
    assert cfg.log_level == "INFO"
def test_env_overrides(monkeypatch: pytest.MonkeyPatch) -> None:
    """Every ``IX_*`` variable overrides its field and parses to the field type."""
    _clear_ix_env(monkeypatch)

    # field name -> value expected on the parsed config. The env var is the
    # upper-cased field name behind the ``IX_`` prefix, and the raw env value
    # is the string form of the expected value.
    overrides: dict[str, str | int] = {
        "postgres_url": "postgresql+asyncpg://u:p@db:5432/x",
        "ollama_url": "http://llm:11434",
        "default_model": "llama3:8b",
        "ocr_engine": "tesseract",
        "pipeline_worker_concurrency": 4,
        "pipeline_request_timeout_seconds": 120,
        "genai_call_timeout_seconds": 60,
        "render_max_pixels_per_page": 1_000_000,
        "tmp_dir": "/var/tmp/ix",
        "file_max_bytes": 1_048_576,
        "file_connect_timeout_seconds": 5,
        "file_read_timeout_seconds": 15,
        "callback_timeout_seconds": 30,
        "log_level": "DEBUG",
    }
    for field, value in overrides.items():
        monkeypatch.setenv(f"IX_{field.upper()}", str(value))

    # Disable dotenv loading so only the variables set above are in play.
    cfg = AppConfig(_env_file=None)  # type: ignore[call-arg]

    for field, expected in overrides.items():
        actual = getattr(cfg, field)
        assert actual == expected, field
        # Env values arrive as strings; int fields must come back as int.
        assert type(actual) is type(expected), field
def test_get_config_is_cached(monkeypatch: pytest.MonkeyPatch) -> None:
    """``get_config()`` returns one instance until ``cache_clear()`` is called."""
    _clear_ix_env(monkeypatch)
    monkeypatch.setenv("IX_POSTGRES_URL", "postgresql+asyncpg://a:b@c:5432/d1")
    first = get_config()

    # Later mutation must NOT be seen until cache_clear — this is a feature,
    # not a bug: config is process-level state, not per-call.
    monkeypatch.setenv("IX_POSTGRES_URL", "postgresql+asyncpg://a:b@c:5432/d2")
    second = get_config()
    assert first is second
    assert second.postgres_url.endswith("/d1")

    # After clearing, the next call rebuilds from the current environment.
    get_config.cache_clear()
    third = get_config()
    assert third is not first
    assert third.postgres_url.endswith("/d2")
def test_extra_env_keys_are_ignored(monkeypatch: pytest.MonkeyPatch) -> None:
    """A typo'd IX_FOOBAR should not raise ValidationError at startup."""
    _clear_ix_env(monkeypatch)
    monkeypatch.setenv("IX_FOOBAR", "whatever")

    # Should not raise.
    cfg = AppConfig(_env_file=None)  # type: ignore[call-arg]

    # Known fields still resolve to their defaults...
    assert cfg.ollama_url.startswith("http://")
    # ...and the unknown key is dropped rather than attached to the model.
    assert not hasattr(cfg, "foobar")
def test_engine_uses_config_url(monkeypatch: pytest.MonkeyPatch) -> None:
    """``ix.store.engine`` reads the URL through ``AppConfig``.

    Task 3.2 refactors engine.py to go through ``get_config()`` instead of
    reading ``os.environ`` directly. We can't actually construct an async
    engine in a unit test (would need the DB), so we verify the resolution
    function exists and returns the configured URL.
    """
    _clear_ix_env(monkeypatch)
    monkeypatch.setenv("IX_POSTGRES_URL", "postgresql+asyncpg://a:b@c:5432/d")

    # Imported only after the environment has been prepared.
    from ix.store.engine import _resolve_url

    assert _resolve_url() == "postgresql+asyncpg://a:b@c:5432/d"