"""Async CRUD over ``ix_jobs`` — the one module the worker / REST touches. Every method takes an :class:`AsyncSession` (caller-owned transaction). The caller commits. We don't manage transactions inside repo methods because the worker sometimes needs to claim + run-pipeline + mark-done inside one long-running unit of work, and an inside-the-method commit would break that. A few invariants worth stating up front: * ``ix_id`` is a 16-char hex string assigned by :func:`insert_pending` on first insert. Callers MUST NOT pass one (we generate it); if a ``RequestIX`` arrives with ``ix_id`` set it is ignored. * ``(client_id, request_id)`` is unique — on collision we return the existing row unchanged. Callback URLs on the second insert are ignored; the first insert's metadata wins. * Claim uses ``FOR UPDATE SKIP LOCKED`` so concurrent workers never pick the same row, and a session holding a lock doesn't block a sibling claimer. * Status transitions: ``pending → running → (done | error)``. The sweeper is the only path back to ``pending`` (and only from ``running``); terminal states are stable. """ from __future__ import annotations import secrets from datetime import UTC, datetime from typing import TYPE_CHECKING, Literal from uuid import UUID, uuid4 from sqlalchemy import func, select, update from sqlalchemy.dialects.postgresql import insert as pg_insert from ix.contracts.job import Job from ix.contracts.request import RequestIX from ix.contracts.response import ResponseIX from ix.store.models import IxJob if TYPE_CHECKING: from sqlalchemy.ext.asyncio import AsyncSession def _new_ix_id() -> str: """Transport-assigned 16-hex handle. ``secrets.token_hex(8)`` gives 16 characters of entropy; good enough to tag logs per spec §3 without collision risk across the lifetime of the service. """ return secrets.token_hex(8) def _orm_to_job(row: IxJob) -> Job: """Round-trip ORM row back through the Pydantic ``Job`` contract. The JSONB columns come out as plain dicts; we let Pydantic re-validate them into :class:`RequestIX` / :class:`ResponseIX`. Catching validation errors here would mask real bugs; we let them surface. """ return Job( job_id=row.job_id, ix_id=row.ix_id, client_id=row.client_id, request_id=row.request_id, status=row.status, # type: ignore[arg-type] request=RequestIX.model_validate(row.request), response=( ResponseIX.model_validate(row.response) if row.response is not None else None ), callback_url=row.callback_url, callback_status=row.callback_status, # type: ignore[arg-type] attempts=row.attempts, created_at=row.created_at, started_at=row.started_at, finished_at=row.finished_at, ) async def insert_pending( session: AsyncSession, request: RequestIX, callback_url: str | None, ) -> Job: """Insert a pending row; return the new or existing :class:`Job`. Uses ``INSERT ... ON CONFLICT DO NOTHING`` on the ``(client_id, request_id)`` unique index, then re-selects. If the insert was a no-op the existing row is returned verbatim (status / callback_url unchanged) — callers rely on this for idempotent resubmission. """ ix_id = request.ix_id or _new_ix_id() job_id = uuid4() # Serialise the request through Pydantic so JSONB gets plain JSON types, # not datetime / Decimal instances asyncpg would reject. request_json = request.model_copy(update={"ix_id": ix_id}).model_dump( mode="json" ) stmt = ( pg_insert(IxJob) .values( job_id=job_id, ix_id=ix_id, client_id=request.ix_client_id, request_id=request.request_id, status="pending", request=request_json, response=None, callback_url=callback_url, callback_status=None, attempts=0, ) .on_conflict_do_nothing(index_elements=["client_id", "request_id"]) ) await session.execute(stmt) row = await session.scalar( select(IxJob).where( IxJob.client_id == request.ix_client_id, IxJob.request_id == request.request_id, ) ) assert row is not None, "insert_pending: row missing after upsert" return _orm_to_job(row) async def claim_next_pending(session: AsyncSession) -> Job | None: """Atomically pick the oldest pending row and flip it to running. ``FOR UPDATE SKIP LOCKED`` means a sibling worker can never deadlock on our row; they'll skip past it and grab the next pending entry. The sibling test in :mod:`tests/integration/test_jobs_repo` asserts this. """ stmt = ( select(IxJob) .where(IxJob.status == "pending") .order_by(IxJob.created_at) .limit(1) .with_for_update(skip_locked=True) ) row = await session.scalar(stmt) if row is None: return None row.status = "running" row.started_at = datetime.now(UTC) await session.flush() return _orm_to_job(row) async def get(session: AsyncSession, job_id: UUID) -> Job | None: row = await session.scalar(select(IxJob).where(IxJob.job_id == job_id)) return _orm_to_job(row) if row is not None else None async def get_by_correlation( session: AsyncSession, client_id: str, request_id: str ) -> Job | None: row = await session.scalar( select(IxJob).where( IxJob.client_id == client_id, IxJob.request_id == request_id, ) ) return _orm_to_job(row) if row is not None else None async def mark_done( session: AsyncSession, job_id: UUID, response: ResponseIX ) -> None: """Write the pipeline's response and move to terminal state. Status is ``done`` iff ``response.error is None``; any non-None error flips us to ``error``. Spec §3 lifecycle invariant. """ status = "done" if response.error is None else "error" await session.execute( update(IxJob) .where(IxJob.job_id == job_id) .values( status=status, response=response.model_dump(mode="json"), finished_at=datetime.now(UTC), ) ) async def mark_error( session: AsyncSession, job_id: UUID, response: ResponseIX ) -> None: """Convenience wrapper that always writes status='error'. Separate from :func:`mark_done` for readability at call sites: when the worker knows it caught an exception the pipeline didn't handle itself, ``mark_error`` signals intent even if the response body happens to have a populated error field. """ await session.execute( update(IxJob) .where(IxJob.job_id == job_id) .values( status="error", response=response.model_dump(mode="json"), finished_at=datetime.now(UTC), ) ) async def update_callback_status( session: AsyncSession, job_id: UUID, status: Literal["delivered", "failed"], ) -> None: await session.execute( update(IxJob) .where(IxJob.job_id == job_id) .values(callback_status=status) ) async def sweep_orphans( session: AsyncSession, now: datetime, max_running_seconds: int, ) -> list[UUID]: """Reset stale ``running`` rows back to ``pending`` and bump ``attempts``. Called once at worker startup (spec §3) to rescue jobs whose owner died mid-pipeline. The threshold is time-based on ``started_at`` so a still- running worker never reclaims its own in-flight job — callers pass ``2 * IX_PIPELINE_REQUEST_TIMEOUT_SECONDS`` per spec. """ # Pick candidates and return their ids so the worker can log what it # did. Two-step (SELECT then UPDATE) is clearer than RETURNING for # callers who want the id list alongside a plain UPDATE. candidates = ( await session.scalars( select(IxJob.job_id).where( IxJob.status == "running", IxJob.started_at < now - _as_interval(max_running_seconds), ) ) ).all() if not candidates: return [] await session.execute( update(IxJob) .where(IxJob.job_id.in_(candidates)) .values( status="pending", started_at=None, attempts=IxJob.attempts + 1, ) ) return list(candidates) def _as_interval(seconds: int): # type: ignore[no-untyped-def] """Return a SQL interval expression for ``seconds``. We build the interval via ``func.make_interval`` so asyncpg doesn't have to guess at a text-form cast — the server-side ``make_interval(secs :=)`` is unambiguous and avoids locale-dependent parsing. """ return func.make_interval(0, 0, 0, 0, 0, 0, seconds)