From 2e8ca0ee4331ee42992b86b9635cb8cad114b52d Mon Sep 17 00:00:00 2001 From: Dirk Riemann Date: Sat, 18 Apr 2026 21:27:54 +0200 Subject: [PATCH] feat(ui): add browser UI at /ui for job submission MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Minimal Jinja2 + HTMX + Pico CSS UI (all CDN, no build step) that lets a user drop a PDF, pick a registered use case or define one inline, tweak OCR/GenAI/provenance options, submit, and watch the pretty-JSON result come back via 2s HTMX polling. Uploads land in {tmp_dir}/ui/<uuid>.pdf via aiofiles streaming with the existing IX_FILE_MAX_BYTES cap. All submissions go through the same jobs_repo.insert_pending entry point the REST adapter uses — no duplicated logic. The REST surface is unchanged. Tests: tests/integration/test_ui_routes.py — 8 cases covering GET /ui, registered + custom use-case submissions (asserting the stored request carries use_case_inline for the custom path), malformed fields_json rejection, and the fragment renderer for pending vs. done. New deps pinned explicitly in pyproject.toml: jinja2, aiofiles, python-multipart (arrive transitively via FastAPI but we own the import surface now). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- AGENTS.md | 4 +- README.md | 6 +- pyproject.toml | 8 + src/ix/app.py | 13 + src/ix/ui/__init__.py | 13 + src/ix/ui/routes.py | 307 ++++++++++++++++++++++ src/ix/ui/static/.gitkeep | 0 src/ix/ui/templates/index.html | 188 ++++++++++++++ src/ix/ui/templates/job_fragment.html | 24 ++ tests/integration/test_ui_routes.py | 350 ++++++++++++++++++++++++++ uv.lock | 24 ++ 11 files changed, 934 insertions(+), 3 deletions(-) create mode 100644 src/ix/ui/__init__.py create mode 100644 src/ix/ui/routes.py create mode 100644 src/ix/ui/static/.gitkeep create mode 100644 src/ix/ui/templates/index.html create mode 100644 src/ix/ui/templates/job_fragment.html create mode 100644 tests/integration/test_ui_routes.py diff --git a/AGENTS.md b/AGENTS.md index 885745a..292cb19 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -4,9 +4,9 @@ Async, on-prem, LLM-powered structured information extraction microservice. Give Designed to be used by other on-prem services (e.g. mammon) as a reliable fallback / second opinion for format-specific deterministic parsers. -Status: MVP deployed (2026-04-18) at `http://192.168.68.42:8994` — LAN only. Full reference spec at `docs/spec-core-pipeline.md`; MVP spec at `docs/superpowers/specs/2026-04-18-ix-mvp-design.md`; deploy runbook at `docs/deployment.md`. +Status: MVP deployed (2026-04-18) at `http://192.168.68.42:8994` — LAN only. Browser UI at `http://192.168.68.42:8994/ui`. Full reference spec at `docs/spec-core-pipeline.md`; MVP spec at `docs/superpowers/specs/2026-04-18-ix-mvp-design.md`; deploy runbook at `docs/deployment.md`. -Use cases: the built-in registry lives in `src/ix/use_cases/__init__.py` (`bank_statement_header` for MVP). Callers without a registered entry can ship an ad-hoc schema inline via `RequestIX.use_case_inline` (see README "Ad-hoc use cases"); the pipeline builds the Pydantic classes on the fly per request. 
+Use cases: the built-in registry lives in `src/ix/use_cases/__init__.py` (`bank_statement_header` for MVP). Callers without a registered entry can ship an ad-hoc schema inline via `RequestIX.use_case_inline` (see README "Ad-hoc use cases"); the pipeline builds the Pydantic classes on the fly per request. The `/ui` page exposes this as a "custom" option so non-engineering users can experiment without a deploy. ## Guiding Principles diff --git a/README.md b/README.md index 7c065ae..0bac34a 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,11 @@ Async, on-prem, LLM-powered structured information extraction microservice. Given a document (PDF, image, text) and a named *use case*, ix returns a structured JSON result whose shape matches the use-case schema — together with per-field provenance (OCR segment IDs, bounding boxes, cross-OCR agreement flags) that let the caller decide how much to trust each extracted value. -**Status:** MVP deployed. Live on the home LAN at `http://192.168.68.42:8994`. +**Status:** MVP deployed. Live on the home LAN at `http://192.168.68.42:8994` (REST API + browser UI at `/ui`). + +## Web UI + +A minimal browser UI lives at [`http://192.168.68.42:8994/ui`](http://192.168.68.42:8994/ui): drop a PDF, pick a registered use case or define one inline, submit, see the pretty-printed result. HTMX polls the job status every 2 s until the pipeline finishes. LAN-only, no auth. - Full reference spec: [`docs/spec-core-pipeline.md`](docs/spec-core-pipeline.md) (aspirational; MVP is a strict subset) - **MVP design:** [`docs/superpowers/specs/2026-04-18-ix-mvp-design.md`](docs/superpowers/specs/2026-04-18-ix-mvp-design.md) diff --git a/pyproject.toml b/pyproject.toml index fbf6c30..dbb3cb4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,6 +29,14 @@ dependencies = [ "pillow>=10.2,<11.0", "python-magic>=0.4.27", "python-dateutil>=2.9", + + # UI (HTMX + Jinja2 templates served from /ui). 
All three may already arrive as transitive + # deps via FastAPI/Starlette, but we declare them explicitly so the import + # surface is owned by us. python-multipart backs FastAPI's `Form()` / + # `UploadFile` parsing — required by `/ui/jobs` submissions. + "jinja2>=3.1", + "aiofiles>=24.1", + "python-multipart>=0.0.12", ] [project.optional-dependencies] diff --git a/src/ix/app.py b/src/ix/app.py index 42aa06a..79c921a 100644 --- a/src/ix/app.py +++ b/src/ix/app.py @@ -24,6 +24,7 @@ from contextlib import asynccontextmanager, suppress from typing import Literal from fastapi import FastAPI +from fastapi.staticfiles import StaticFiles from ix.adapters.rest.routes import Probes, get_probes from ix.adapters.rest.routes import router as rest_router @@ -38,6 +39,8 @@ from ix.pipeline.pipeline import Pipeline from ix.pipeline.reliability_step import ReliabilityStep from ix.pipeline.response_handler_step import ResponseHandlerStep from ix.pipeline.setup_step import SetupStep +from ix.ui import build_router as build_ui_router +from ix.ui.routes import STATIC_DIR as UI_STATIC_DIR def build_pipeline( @@ -202,6 +205,16 @@ def create_app(*, spawn_worker: bool = True) -> FastAPI: app = FastAPI(lifespan=lifespan, title="infoxtractor", version="0.1.0") app.include_router(rest_router) + # Browser UI — additive, never touches the REST paths above. + app.include_router(build_ui_router()) + # Static assets for the UI. CDN-only for MVP so the directory is + # essentially empty, but the mount must exist so relative asset + # URLs resolve cleanly. + app.mount( + "/ui/static", + StaticFiles(directory=str(UI_STATIC_DIR)), + name="ui-static", + ) return app diff --git a/src/ix/ui/__init__.py b/src/ix/ui/__init__.py new file mode 100644 index 0000000..66577d3 --- /dev/null +++ b/src/ix/ui/__init__.py @@ -0,0 +1,13 @@ +"""Minimal browser UI served alongside the REST API at ``/ui``. + +The module is intentionally thin: templates + HTMX + Pico CSS (all from +CDNs, no build step). 
Uploads land in ``{cfg.tmp_dir}/ui/<uuid>.pdf`` and +are submitted through the same :func:`ix.store.jobs_repo.insert_pending` +entry point the REST adapter uses — the UI does not duplicate that logic. +""" + +from __future__ import annotations + +from ix.ui.routes import build_router + +__all__ = ["build_router"] diff --git a/src/ix/ui/routes.py b/src/ix/ui/routes.py new file mode 100644 index 0000000..4d7daf2 --- /dev/null +++ b/src/ix/ui/routes.py @@ -0,0 +1,307 @@ +"""``/ui`` router — thin HTML wrapper over the existing jobs pipeline. + +Design notes: + +* Uploads stream to ``{cfg.tmp_dir}/ui/{uuid4()}.pdf`` via aiofiles; the + file persists for the lifetime of the ``ix_id`` (no cleanup cron — spec + deferred). +* The submission handler builds a :class:`RequestIX` (inline use case + supported) and inserts it via the same + :func:`ix.store.jobs_repo.insert_pending` the REST adapter uses. +* Responses are HTML. For HTMX-triggered submissions the handler returns + ``HX-Redirect`` so the whole page swaps; for plain form posts it returns + a 303 redirect. +* The fragment endpoint powers the polling loop: while the job is + pending/running, the fragment auto-refreshes every 2s via + ``hx-trigger="every 2s"``; when terminal, the trigger is dropped and the + pretty-printed response is rendered with highlight.js. 
+""" + +from __future__ import annotations + +import json +import uuid +from pathlib import Path +from typing import Annotated +from uuid import UUID + +import aiofiles +from fastapi import ( + APIRouter, + Depends, + File, + Form, + HTTPException, + Request, + UploadFile, +) +from fastapi.responses import HTMLResponse, RedirectResponse, Response +from fastapi.templating import Jinja2Templates +from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker + +from ix.adapters.rest.routes import get_session_factory_dep +from ix.config import AppConfig, get_config +from ix.contracts.request import ( + Context, + FileRef, + GenAIOptions, + InlineUseCase, + OCROptions, + Options, + ProvenanceOptions, + RequestIX, + UseCaseFieldDef, +) +from ix.store import jobs_repo +from ix.use_cases import REGISTRY + +TEMPLATES_DIR = Path(__file__).parent / "templates" +STATIC_DIR = Path(__file__).parent / "static" + + +def _templates() -> Jinja2Templates: + """One Jinja env per process; cheap enough to build per DI call.""" + + return Jinja2Templates(directory=str(TEMPLATES_DIR)) + + +def _ui_tmp_dir(cfg: AppConfig) -> Path: + """Where uploads land. Created on first use; never cleaned up.""" + + d = Path(cfg.tmp_dir) / "ui" + d.mkdir(parents=True, exist_ok=True) + return d + + +def build_router() -> APIRouter: + """Return a fresh router. 
Kept as a factory so :mod:`ix.app` can wire DI.""" + + router = APIRouter(prefix="/ui", tags=["ui"]) + + @router.get("", response_class=HTMLResponse) + @router.get("/", response_class=HTMLResponse) + async def index(request: Request) -> Response: + tpl = _templates() + return tpl.TemplateResponse( + request, + "index.html", + { + "registered_use_cases": sorted(REGISTRY.keys()), + "job": None, + "form_error": None, + "form_values": {}, + }, + ) + + @router.get("/jobs/{job_id}", response_class=HTMLResponse) + async def job_page( + request: Request, + job_id: UUID, + session_factory: Annotated[ + async_sessionmaker[AsyncSession], Depends(get_session_factory_dep) + ], + ) -> Response: + async with session_factory() as session: + job = await jobs_repo.get(session, job_id) + if job is None: + raise HTTPException(status_code=404, detail="job not found") + tpl = _templates() + return tpl.TemplateResponse( + request, + "index.html", + { + "registered_use_cases": sorted(REGISTRY.keys()), + "job": job, + "form_error": None, + "form_values": {}, + }, + ) + + @router.get("/jobs/{job_id}/fragment", response_class=HTMLResponse) + async def job_fragment( + request: Request, + job_id: UUID, + session_factory: Annotated[ + async_sessionmaker[AsyncSession], Depends(get_session_factory_dep) + ], + ) -> Response: + async with session_factory() as session: + job = await jobs_repo.get(session, job_id) + if job is None: + raise HTTPException(status_code=404, detail="job not found") + response_json: str | None = None + if job.response is not None: + response_json = json.dumps( + job.response.model_dump(mode="json"), + indent=2, + sort_keys=True, + default=str, + ) + tpl = _templates() + return tpl.TemplateResponse( + request, + "job_fragment.html", + {"job": job, "response_json": response_json}, + ) + + @router.post("/jobs") + async def submit_job( + request: Request, + session_factory: Annotated[ + async_sessionmaker[AsyncSession], Depends(get_session_factory_dep) + ], + pdf: 
Annotated[UploadFile, File()], + use_case_name: Annotated[str, Form()], + use_case_mode: Annotated[str, Form()] = "registered", + texts: Annotated[str, Form()] = "", + ix_client_id: Annotated[str, Form()] = "ui", + request_id: Annotated[str, Form()] = "", + system_prompt: Annotated[str, Form()] = "", + default_model: Annotated[str, Form()] = "", + fields_json: Annotated[str, Form()] = "", + use_ocr: Annotated[str, Form()] = "", + ocr_only: Annotated[str, Form()] = "", + include_ocr_text: Annotated[str, Form()] = "", + include_geometries: Annotated[str, Form()] = "", + gen_ai_model_name: Annotated[str, Form()] = "", + include_provenance: Annotated[str, Form()] = "", + max_sources_per_field: Annotated[str, Form()] = "10", + ) -> Response: + cfg = get_config() + form_values = { + "use_case_mode": use_case_mode, + "use_case_name": use_case_name, + "ix_client_id": ix_client_id, + "request_id": request_id, + "texts": texts, + "system_prompt": system_prompt, + "default_model": default_model, + "fields_json": fields_json, + "use_ocr": use_ocr, + "ocr_only": ocr_only, + "include_ocr_text": include_ocr_text, + "include_geometries": include_geometries, + "gen_ai_model_name": gen_ai_model_name, + "include_provenance": include_provenance, + "max_sources_per_field": max_sources_per_field, + } + + def _rerender(error: str, status: int = 200) -> Response: + tpl = _templates() + return tpl.TemplateResponse( + request, + "index.html", + { + "registered_use_cases": sorted(REGISTRY.keys()), + "job": None, + "form_error": error, + "form_values": form_values, + }, + status_code=status, + ) + + # --- Inline use case (optional) --- + inline: InlineUseCase | None = None + if use_case_mode == "custom": + try: + raw_fields = json.loads(fields_json) + except json.JSONDecodeError as exc: + return _rerender(f"Invalid fields JSON: {exc}", status=422) + if not isinstance(raw_fields, list): + return _rerender( + "Invalid fields JSON: must be a list of field objects", + status=422, + ) + try: + 
parsed = [UseCaseFieldDef.model_validate(f) for f in raw_fields] + inline = InlineUseCase( + use_case_name=use_case_name, + system_prompt=system_prompt, + default_model=default_model or None, + fields=parsed, + ) + except Exception as exc: # pydantic ValidationError or similar + return _rerender( + f"Invalid inline use-case definition: {exc}", + status=422, + ) + + # --- PDF upload --- + upload_dir = _ui_tmp_dir(cfg) + target = upload_dir / f"{uuid.uuid4().hex}.pdf" + # Stream copy with a size cap matching IX_FILE_MAX_BYTES. + total = 0 + limit = cfg.file_max_bytes + async with aiofiles.open(target, "wb") as out: + while True: + chunk = await pdf.read(64 * 1024) + if not chunk: + break + total += len(chunk) + if total > limit: + # Drop the partial file; no stored state. + from contextlib import suppress + + with suppress(FileNotFoundError): + target.unlink() + return _rerender( + f"PDF exceeds IX_FILE_MAX_BYTES ({limit} bytes)", + status=413, + ) + await out.write(chunk) + + # --- Build RequestIX --- + ctx_texts: list[str] = [] + if texts.strip(): + ctx_texts = [texts.strip()] + + req_id = request_id.strip() or uuid.uuid4().hex + try: + request_ix = RequestIX( + use_case=use_case_name or "adhoc", + use_case_inline=inline, + ix_client_id=(ix_client_id.strip() or "ui"), + request_id=req_id, + context=Context( + files=[FileRef(url=f"file://{target.resolve()}")], + texts=ctx_texts, + ), + options=Options( + ocr=OCROptions( + use_ocr=_flag(use_ocr, default=True), + ocr_only=_flag(ocr_only, default=False), + include_ocr_text=_flag(include_ocr_text, default=False), + include_geometries=_flag(include_geometries, default=False), + ), + gen_ai=GenAIOptions( + gen_ai_model_name=(gen_ai_model_name.strip() or None), + ), + provenance=ProvenanceOptions( + include_provenance=_flag(include_provenance, default=True), + max_sources_per_field=int(max_sources_per_field or 10), + ), + ), + ) + except Exception as exc: + return _rerender(f"Invalid request: {exc}", status=422) + + async 
with session_factory() as session: + job = await jobs_repo.insert_pending( + session, request_ix, callback_url=None + ) + await session.commit() + + redirect_to = f"/ui/jobs/{job.job_id}" + if request.headers.get("HX-Request", "").lower() == "true": + return Response(status_code=200, headers={"HX-Redirect": redirect_to}) + return RedirectResponse(url=redirect_to, status_code=303) + + return router + + +def _flag(value: str, *, default: bool) -> bool: + """HTML forms omit unchecked checkboxes. Treat absence as ``default``.""" + + if value == "": + return default + return value.lower() in ("on", "true", "1", "yes") diff --git a/src/ix/ui/static/.gitkeep b/src/ix/ui/static/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/src/ix/ui/templates/index.html b/src/ix/ui/templates/index.html new file mode 100644 index 0000000..0459327 --- /dev/null +++ b/src/ix/ui/templates/index.html @@ -0,0 +1,188 @@ + + + + + + infoxtractor UI + + + + + + + +
+
+

infoxtractor

+

Drop a PDF, pick or define a use case, run the pipeline.

+
+ + {% if form_error %} +
+

Form error: {{ form_error }}

+
+ {% endif %} + +
+
+ + + + +
+ Use case + + + + + +
+ + + +
+
+ +
+ Advanced options + + + +
+ OCR + + + + +
+ + + +
+ Provenance + + +
+
+ + +
+
+ + {% if job %} +
+
+ Job {{ job.job_id }} +
ix_id: {{ job.ix_id }} +
+
+ Loading… +
+
+ {% endif %} +
+ + + + diff --git a/src/ix/ui/templates/job_fragment.html b/src/ix/ui/templates/job_fragment.html new file mode 100644 index 0000000..645adb2 --- /dev/null +++ b/src/ix/ui/templates/job_fragment.html @@ -0,0 +1,24 @@ +{#- HTMX fragment rendered into #job-status on the results panel. + Pending/running → keep polling every 2s; terminal → render JSON. -#} +{% set terminal = job.status in ("done", "error") %} +
+

+ Status: {{ job.status }} + {% if not terminal %} + + {% endif %} +

+ + {% if terminal and response_json %} +
{{ response_json }}
+ {% elif terminal %} +

No response body.

+ {% endif %} +
diff --git a/tests/integration/test_ui_routes.py b/tests/integration/test_ui_routes.py new file mode 100644 index 0000000..3822277 --- /dev/null +++ b/tests/integration/test_ui_routes.py @@ -0,0 +1,350 @@ +"""Integration tests for the `/ui` router (spec §PR 2). + +Covers the full round-trip through `POST /ui/jobs` — the handler parses +multipart form data into a `RequestIX` and hands it to +`ix.store.jobs_repo.insert_pending`, the same entry point the REST adapter +uses. Tests assert the job row exists with the right client/request ids and +that custom-use-case forms produce a `use_case_inline` block in the stored +request JSON. + +The DB-touching tests depend on the shared integration conftest which +spins up migrations against the configured Postgres; the pure-template +tests (`GET /ui` and the fragment renderer) still need a factory but +won't actually query — they're cheap. +""" + +from __future__ import annotations + +import json +from collections.abc import Iterator +from pathlib import Path +from uuid import uuid4 + +import pytest +from fastapi.testclient import TestClient +from sqlalchemy import select +from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine + +from ix.adapters.rest.routes import Probes, get_probes, get_session_factory_dep +from ix.app import create_app +from ix.store.models import IxJob + +FIXTURE_DIR = Path(__file__).resolve().parents[1] / "fixtures" +FIXTURE_PDF = FIXTURE_DIR / "synthetic_giro.pdf" + + +def _factory_for_url(postgres_url: str): # type: ignore[no-untyped-def] + def _factory(): # type: ignore[no-untyped-def] + eng = create_async_engine(postgres_url, pool_pre_ping=True) + return async_sessionmaker(eng, expire_on_commit=False) + + return _factory + + +@pytest.fixture +def app(postgres_url: str) -> Iterator[TestClient]: + app_obj = create_app(spawn_worker=False) + app_obj.dependency_overrides[get_session_factory_dep] = _factory_for_url( + postgres_url + ) + app_obj.dependency_overrides[get_probes] = lambda: 
Probes( + ollama=lambda: "ok", ocr=lambda: "ok" + ) + with TestClient(app_obj) as client: + yield client + + +class TestIndexPage: + def test_index_returns_html(self, app: TestClient) -> None: + resp = app.get("/ui") + assert resp.status_code == 200 + assert "text/html" in resp.headers["content-type"] + body = resp.text + # Dropdown prefilled with the registered use case. + assert "bank_statement_header" in body + # Marker for the submission form. + assert ' None: + # StaticFiles returns 404 for the keepfile; the mount itself must + # exist so asset URLs resolve. We probe the directory root instead. + resp = app.get("/ui/static/.gitkeep") + # .gitkeep exists in the repo — expect 200 (or at minimum not a 404 + # due to missing mount). A 405/403 would also indicate the mount is + # wired; we assert the response is *not* a 404 from a missing route. + assert resp.status_code != 404 + + +class TestSubmitJobRegistered: + def test_post_registered_use_case_creates_row( + self, + app: TestClient, + postgres_url: str, + ) -> None: + request_id = f"ui-reg-{uuid4().hex[:8]}" + with FIXTURE_PDF.open("rb") as fh: + resp = app.post( + "/ui/jobs", + data={ + "use_case_mode": "registered", + "use_case_name": "bank_statement_header", + "ix_client_id": "ui-test", + "request_id": request_id, + "texts": "", + "use_ocr": "on", + "include_provenance": "on", + "max_sources_per_field": "10", + }, + files={"pdf": ("sample.pdf", fh, "application/pdf")}, + follow_redirects=False, + ) + assert resp.status_code in (200, 303), resp.text + + # Assert the row exists in the DB. + job_row = _find_job(postgres_url, "ui-test", request_id) + assert job_row is not None + assert job_row.status == "pending" + assert job_row.request["use_case"] == "bank_statement_header" + # Context.files must reference a local file:// path. 
+ files = job_row.request["context"]["files"] + assert len(files) == 1 + entry = files[0] + url = entry if isinstance(entry, str) else entry["url"] + assert url.startswith("file://") + + def test_htmx_submit_uses_hx_redirect_header( + self, + app: TestClient, + ) -> None: + request_id = f"ui-htmx-{uuid4().hex[:8]}" + with FIXTURE_PDF.open("rb") as fh: + resp = app.post( + "/ui/jobs", + data={ + "use_case_mode": "registered", + "use_case_name": "bank_statement_header", + "ix_client_id": "ui-test", + "request_id": request_id, + }, + files={"pdf": ("sample.pdf", fh, "application/pdf")}, + headers={"HX-Request": "true"}, + follow_redirects=False, + ) + assert resp.status_code == 200 + assert "HX-Redirect" in resp.headers + + +class TestSubmitJobCustom: + def test_post_custom_use_case_stores_inline( + self, + app: TestClient, + postgres_url: str, + ) -> None: + request_id = f"ui-cust-{uuid4().hex[:8]}" + fields_json = json.dumps( + [ + {"name": "vendor", "type": "str", "required": True}, + {"name": "total", "type": "decimal"}, + ] + ) + with FIXTURE_PDF.open("rb") as fh: + resp = app.post( + "/ui/jobs", + data={ + "use_case_mode": "custom", + "use_case_name": "invoice_adhoc", + "ix_client_id": "ui-test", + "request_id": request_id, + "system_prompt": "Extract vendor and total.", + "default_model": "qwen3:14b", + "fields_json": fields_json, + }, + files={"pdf": ("sample.pdf", fh, "application/pdf")}, + follow_redirects=False, + ) + assert resp.status_code in (200, 303), resp.text + job_row = _find_job(postgres_url, "ui-test", request_id) + assert job_row is not None + stored = job_row.request["use_case_inline"] + assert stored is not None + assert stored["use_case_name"] == "invoice_adhoc" + assert stored["system_prompt"] == "Extract vendor and total." 
+ names = [f["name"] for f in stored["fields"]] + assert names == ["vendor", "total"] + + def test_post_malformed_fields_json_rejected( + self, + app: TestClient, + postgres_url: str, + ) -> None: + request_id = f"ui-bad-{uuid4().hex[:8]}" + with FIXTURE_PDF.open("rb") as fh: + resp = app.post( + "/ui/jobs", + data={ + "use_case_mode": "custom", + "use_case_name": "adhoc_bad", + "ix_client_id": "ui-test", + "request_id": request_id, + "system_prompt": "p", + "fields_json": "this is not json", + }, + files={"pdf": ("sample.pdf", fh, "application/pdf")}, + follow_redirects=False, + ) + # Either re-rendered form (422 / 200 with error) — what matters is + # that no row was inserted. + assert resp.status_code in (200, 400, 422) + job_row = _find_job(postgres_url, "ui-test", request_id) + assert job_row is None + # A helpful error should appear somewhere in the body. + assert ( + "error" in resp.text.lower() + or "invalid" in resp.text.lower() + or "json" in resp.text.lower() + ) + + +class TestFragment: + def test_fragment_pending_has_trigger( + self, + app: TestClient, + postgres_url: str, + ) -> None: + request_id = f"ui-frag-p-{uuid4().hex[:8]}" + with FIXTURE_PDF.open("rb") as fh: + app.post( + "/ui/jobs", + data={ + "use_case_mode": "registered", + "use_case_name": "bank_statement_header", + "ix_client_id": "ui-test", + "request_id": request_id, + }, + files={"pdf": ("sample.pdf", fh, "application/pdf")}, + follow_redirects=False, + ) + job_row = _find_job(postgres_url, "ui-test", request_id) + assert job_row is not None + + resp = app.get(f"/ui/jobs/{job_row.job_id}/fragment") + assert resp.status_code == 200 + body = resp.text + # Pending → auto-refresh every 2s. 
+ assert "hx-trigger" in body + assert "2s" in body + assert "pending" in body.lower() or "running" in body.lower() + + def test_fragment_done_shows_pretty_json( + self, + app: TestClient, + postgres_url: str, + ) -> None: + request_id = f"ui-frag-d-{uuid4().hex[:8]}" + with FIXTURE_PDF.open("rb") as fh: + app.post( + "/ui/jobs", + data={ + "use_case_mode": "registered", + "use_case_name": "bank_statement_header", + "ix_client_id": "ui-test", + "request_id": request_id, + }, + files={"pdf": ("sample.pdf", fh, "application/pdf")}, + follow_redirects=False, + ) + job_row = _find_job(postgres_url, "ui-test", request_id) + assert job_row is not None + + # Hand-tick the row to done with a fake response. + _force_done( + postgres_url, + job_row.job_id, + response_body={ + "use_case": "bank_statement_header", + "ix_result": {"result": {"bank_name": "UBS AG", "currency": "CHF"}}, + }, + ) + + resp = app.get(f"/ui/jobs/{job_row.job_id}/fragment") + assert resp.status_code == 200 + body = resp.text + # Terminal → no auto-refresh. + assert "every 2s" not in body and "every 2s" not in body + # JSON present. 
+ assert "UBS AG" in body + assert "CHF" in body + + +def _find_job(postgres_url: str, client_id: str, request_id: str): # type: ignore[no-untyped-def] + """Look up an ``ix_jobs`` row via the async engine, wrapping the coroutine + for test convenience.""" + + import asyncio + import json as _json + + async def _go(): # type: ignore[no-untyped-def] + eng = create_async_engine(postgres_url) + sf = async_sessionmaker(eng, expire_on_commit=False) + try: + async with sf() as session: + r = await session.scalar( + select(IxJob).where( + IxJob.client_id == client_id, + IxJob.request_id == request_id, + ) + ) + if r is None: + return None + + class _JobRow: + pass + + out = _JobRow() + out.job_id = r.job_id + out.client_id = r.client_id + out.request_id = r.request_id + out.status = r.status + if isinstance(r.request, str): + out.request = _json.loads(r.request) + else: + out.request = r.request + return out + finally: + await eng.dispose() + + return asyncio.run(_go()) + + +def _force_done( + postgres_url: str, + job_id, # type: ignore[no-untyped-def] + response_body: dict, +) -> None: + """Flip a pending job to ``done`` with the given response payload.""" + + import asyncio + from datetime import UTC, datetime + + from sqlalchemy import text + + async def _go(): # type: ignore[no-untyped-def] + eng = create_async_engine(postgres_url) + try: + async with eng.begin() as conn: + await conn.execute( + text( + "UPDATE ix_jobs SET status='done', " + "response=CAST(:resp AS JSONB), finished_at=:now " + "WHERE job_id=:jid" + ), + { + "resp": json.dumps(response_body), + "now": datetime.now(UTC), + "jid": str(job_id), + }, + ) + finally: + await eng.dispose() + + asyncio.run(_go()) diff --git a/uv.lock b/uv.lock index 30f96b5..eb0a191 100644 --- a/uv.lock +++ b/uv.lock @@ -7,6 +7,15 @@ resolution-markers = [ "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')", ] +[[package]] +name = "aiofiles" +version = "25.1.0" 
+source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/41/c3/534eac40372d8ee36ef40df62ec129bee4fdb5ad9706e58a29be53b2c970/aiofiles-25.1.0.tar.gz", hash = "sha256:a8d728f0a29de45dc521f18f07297428d56992a742f0cd2701ba86e44d23d5b2", size = 46354, upload-time = "2025-10-09T20:51:04.358Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/bc/8a/340a1555ae33d7354dbca4faa54948d76d89a27ceef032c8c3bc661d003e/aiofiles-25.1.0-py3-none-any.whl", hash = "sha256:abe311e527c862958650f9438e859c1fa7568a141b22abcd015e120e86a85695", size = 14668, upload-time = "2025-10-09T20:51:03.174Z" }, +] + [[package]] name = "alembic" version = "1.18.4" @@ -521,16 +530,19 @@ name = "infoxtractor" version = "0.1.0" source = { editable = "." } dependencies = [ + { name = "aiofiles" }, { name = "alembic" }, { name = "asyncpg" }, { name = "fastapi" }, { name = "httpx" }, + { name = "jinja2" }, { name = "pillow" }, { name = "pydantic" }, { name = "pydantic-settings" }, { name = "pymupdf" }, { name = "python-dateutil" }, { name = "python-magic" }, + { name = "python-multipart" }, { name = "sqlalchemy", extra = ["asyncio"] }, { name = "uvicorn", extra = ["standard"] }, ] @@ -550,10 +562,12 @@ ocr = [ [package.metadata] requires-dist = [ + { name = "aiofiles", specifier = ">=24.1" }, { name = "alembic", specifier = ">=1.14" }, { name = "asyncpg", specifier = ">=0.30" }, { name = "fastapi", specifier = ">=0.115" }, { name = "httpx", specifier = ">=0.27" }, + { name = "jinja2", specifier = ">=3.1" }, { name = "mypy", marker = "extra == 'dev'", specifier = ">=1.13" }, { name = "pillow", specifier = ">=10.2,<11.0" }, { name = "pydantic", specifier = ">=2.9" }, @@ -564,6 +578,7 @@ requires-dist = [ { name = "pytest-httpx", marker = "extra == 'dev'", specifier = ">=0.32" }, { name = "python-dateutil", specifier = ">=2.9" }, { name = "python-magic", specifier = ">=0.4.27" }, + { name = "python-multipart", specifier = ">=0.0.12" }, { name = 
"ruff", marker = "extra == 'dev'", specifier = ">=0.8" }, { name = "sqlalchemy", extras = ["asyncio"], specifier = ">=2.0.36" }, { name = "surya-ocr", marker = "extra == 'ocr'", specifier = ">=0.17,<0.18" }, @@ -1350,6 +1365,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/6c/73/9f872cb81fc5c3bb48f7227872c28975f998f3e7c2b1c16e95e6432bbb90/python_magic-0.4.27-py2.py3-none-any.whl", hash = "sha256:c212960ad306f700aa0d01e5d7a325d20548ff97eb9920dcd29513174f0294d3", size = 13840, upload-time = "2022-06-07T20:16:57.763Z" }, ] +[[package]] +name = "python-multipart" +version = "0.0.26" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/88/71/b145a380824a960ebd60e1014256dbb7d2253f2316ff2d73dfd8928ec2c3/python_multipart-0.0.26.tar.gz", hash = "sha256:08fadc45918cd615e26846437f50c5d6d23304da32c341f289a617127b081f17", size = 43501, upload-time = "2026-04-10T14:09:59.473Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9a/22/f1925cdda983ab66fc8ec6ec8014b959262747e58bdca26a4e3d1da29d56/python_multipart-0.0.26-py3-none-any.whl", hash = "sha256:c0b169f8c4484c13b0dcf2ef0ec3a4adb255c4b7d18d8e420477d2b1dd03f185", size = 28847, upload-time = "2026-04-10T14:09:58.131Z" }, +] + [[package]] name = "pyyaml" version = "6.0.3" -- 2.45.2