* `JobsRepo.list_recent` — paginated, filterable view over ix_jobs,
newest first, returning (jobs, total) so the template can render
"Showing N of M".
* `GET /ui/jobs` — filter bar (multi status + client_id), prev/next
pagination, links to `/ui/jobs/{id}` per row. Surfaces filename from
`FileRef.display_name` with URL-basename fallback for legacy rows.
* Persistent nav header gets a "Recent jobs" link on both `/ui` and the
per-job page so users can tab between submit and history.
* Integration tests cover: ordering, status/client filters (single +
multi), pagination, legacy fallback, header links.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
414 lines
13 KiB
Python
"""Async CRUD over ``ix_jobs`` — the one module the worker / REST touches.
|
|
|
|
Every method takes an :class:`AsyncSession` (caller-owned transaction). The
|
|
caller commits. We don't manage transactions inside repo methods because the
|
|
worker sometimes needs to claim + run-pipeline + mark-done inside one
|
|
long-running unit of work, and an inside-the-method commit would break that.
|
|
|
|
A few invariants worth stating up front:
|
|
|
|
* ``ix_id`` is a 16-char hex string assigned by :func:`insert_pending` on
|
|
first insert. Callers MUST NOT pass one (we generate it); if a
|
|
``RequestIX`` arrives with ``ix_id`` set it is ignored.
|
|
* ``(client_id, request_id)`` is unique — on collision we return the
|
|
existing row unchanged. Callback URLs on the second insert are ignored;
|
|
the first insert's metadata wins.
|
|
* Claim uses ``FOR UPDATE SKIP LOCKED`` so concurrent workers never pick the
|
|
same row, and a session holding a lock doesn't block a sibling claimer.
|
|
* Status transitions: ``pending → running → (done | error)``. The sweeper is
|
|
the only path back to ``pending`` (and only from ``running``); terminal
|
|
states are stable.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import secrets
|
|
from collections.abc import Iterable
|
|
from datetime import UTC, datetime
|
|
from typing import TYPE_CHECKING, Literal
|
|
from uuid import UUID, uuid4
|
|
|
|
from sqlalchemy import func, select, update
|
|
from sqlalchemy.dialects.postgresql import insert as pg_insert
|
|
|
|
from ix.contracts.job import Job
|
|
from ix.contracts.request import RequestIX
|
|
from ix.contracts.response import ResponseIX
|
|
from ix.store.models import IxJob
|
|
|
|
if TYPE_CHECKING:
|
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
|
|
|
|
def _new_ix_id() -> str:
|
|
"""Transport-assigned 16-hex handle.
|
|
|
|
``secrets.token_hex(8)`` gives 16 characters of entropy; good enough to
|
|
tag logs per spec §3 without collision risk across the lifetime of the
|
|
service.
|
|
"""
|
|
|
|
return secrets.token_hex(8)
|
|
|
|
|
|
def _orm_to_job(row: IxJob) -> Job:
    """Convert an ``IxJob`` ORM row into the Pydantic :class:`Job` contract.

    The JSONB columns surface as plain dicts; Pydantic re-validates them
    into :class:`RequestIX` / :class:`ResponseIX`. Validation errors are
    allowed to propagate — catching them here would only mask real bugs.
    """
    # Hoist the optional response conversion out of the constructor call.
    response: ResponseIX | None = None
    if row.response is not None:
        response = ResponseIX.model_validate(row.response)

    return Job(
        job_id=row.job_id,
        ix_id=row.ix_id,
        client_id=row.client_id,
        request_id=row.request_id,
        status=row.status,  # type: ignore[arg-type]
        request=RequestIX.model_validate(row.request),
        response=response,
        callback_url=row.callback_url,
        callback_status=row.callback_status,  # type: ignore[arg-type]
        attempts=row.attempts,
        created_at=row.created_at,
        started_at=row.started_at,
        finished_at=row.finished_at,
    )
|
|
|
|
|
|
async def insert_pending(
    session: AsyncSession,
    request: RequestIX,
    callback_url: str | None,
) -> Job:
    """Insert a pending row; return the new or existing :class:`Job`.

    Uses ``INSERT ... ON CONFLICT DO NOTHING`` on the
    ``(client_id, request_id)`` unique index, then re-selects. If the insert
    was a no-op the existing row is returned verbatim (status / callback_url
    unchanged) — callers rely on this for idempotent resubmission.

    Raises
    ------
    RuntimeError
        If the post-upsert re-select finds no row (should be impossible;
        indicates a broken unique index or a concurrent hard delete).
    """
    # Module invariant: ``ix_id`` is assigned HERE, on first insert, and a
    # caller-supplied ``request.ix_id`` is ignored. The previous
    # ``request.ix_id or _new_ix_id()`` let a caller-set value leak through,
    # contradicting the documented contract — always generate our own.
    ix_id = _new_ix_id()
    job_id = uuid4()

    # Serialise the request through Pydantic so JSONB gets plain JSON types,
    # not datetime / Decimal instances asyncpg would reject.
    request_json = request.model_copy(update={"ix_id": ix_id}).model_dump(
        mode="json"
    )

    stmt = (
        pg_insert(IxJob)
        .values(
            job_id=job_id,
            ix_id=ix_id,
            client_id=request.ix_client_id,
            request_id=request.request_id,
            status="pending",
            request=request_json,
            response=None,
            callback_url=callback_url,
            callback_status=None,
            attempts=0,
        )
        .on_conflict_do_nothing(index_elements=["client_id", "request_id"])
    )
    await session.execute(stmt)

    # Re-select regardless of whether our insert won: on conflict the
    # existing row (first insert's metadata) is what we must return.
    row = await session.scalar(
        select(IxJob).where(
            IxJob.client_id == request.ix_client_id,
            IxJob.request_id == request.request_id,
        )
    )
    # Not ``assert``: that would be stripped under ``python -O`` and turn
    # this impossible state into a confusing downstream AttributeError.
    if row is None:
        raise RuntimeError("insert_pending: row missing after upsert")
    return _orm_to_job(row)
|
|
|
|
|
|
async def claim_next_pending(session: AsyncSession) -> Job | None:
    """Atomically pick the oldest pending row and flip it to running.

    ``FOR UPDATE SKIP LOCKED`` guarantees a sibling worker never blocks or
    deadlocks on our row — it simply skips past and claims the next pending
    entry. The sibling test in :mod:`tests/integration/test_jobs_repo`
    asserts this.
    """
    query = (
        select(IxJob)
        .where(IxJob.status == "pending")
        .order_by(IxJob.created_at)
        .limit(1)
        .with_for_update(skip_locked=True)
    )
    candidate = await session.scalar(query)
    if candidate is None:
        # Nothing pending (or everything pending is locked by siblings).
        return None

    candidate.status = "running"
    candidate.started_at = datetime.now(UTC)
    await session.flush()
    return _orm_to_job(candidate)
|
|
|
|
|
|
async def get(session: AsyncSession, job_id: UUID) -> Job | None:
    """Fetch one job by primary key; ``None`` when no such row exists."""
    row = await session.scalar(select(IxJob).where(IxJob.job_id == job_id))
    if row is None:
        return None
    return _orm_to_job(row)
|
|
|
|
|
|
async def queue_position(
    session: AsyncSession, job_id: UUID
) -> tuple[int, int]:
    """Return ``(ahead, total_active)`` for a pending/running job.

    ``ahead`` counts active jobs (``pending`` or ``running``) that would be
    claimed by the worker before this one:

    * any ``running`` job is always ahead — it has the worker already.
    * other ``pending`` jobs with a strictly older ``created_at`` are ahead
      (the worker picks pending rows in ``ORDER BY created_at`` per
      :func:`claim_next_pending`).

    ``total_active`` is the total count of ``pending`` + ``running`` rows.

    Terminal jobs (``done`` / ``error``) — and unknown ids — always return
    ``(0, 0)``: there is no meaningful "position" for a finished job.
    """

    async def _count(*conditions) -> int:
        """COUNT(*) over ``ix_jobs`` under the given WHERE conditions."""
        value = await session.scalar(
            select(func.count()).select_from(IxJob).where(*conditions)
        )
        return int(value or 0)

    row = await session.scalar(select(IxJob).where(IxJob.job_id == job_id))
    if row is None or row.status not in ("pending", "running"):
        return (0, 0)

    total_active = await _count(IxJob.status.in_(("pending", "running")))

    if row.status == "running":
        # A running row is at the head of the queue for our purposes.
        return (0, total_active)

    # Pending: count running rows (always ahead) + older pending rows.
    # Tiebreak on ``job_id`` for deterministic ordering when multiple rows
    # share a ``created_at`` (e.g. the same transaction inserts two jobs,
    # which Postgres stamps with identical ``now()`` values).
    is_older = (IxJob.created_at < row.created_at) | (
        (IxJob.created_at == row.created_at) & (IxJob.job_id < row.job_id)
    )
    running_ahead = await _count(IxJob.status == "running")
    pending_ahead = await _count(IxJob.status == "pending", is_older)
    return (running_ahead + pending_ahead, total_active)
|
|
|
|
|
|
async def get_by_correlation(
    session: AsyncSession, client_id: str, request_id: str
) -> Job | None:
    """Look up a job by its unique ``(client_id, request_id)`` pair."""
    query = select(IxJob).where(
        IxJob.client_id == client_id,
        IxJob.request_id == request_id,
    )
    row = await session.scalar(query)
    if row is None:
        return None
    return _orm_to_job(row)
|
|
|
|
|
|
async def mark_done(
    session: AsyncSession, job_id: UUID, response: ResponseIX
) -> None:
    """Write the pipeline's response and move the job to a terminal state.

    Per the spec §3 lifecycle invariant, status becomes ``done`` iff
    ``response.error is None``; any non-None error flips us to ``error``.
    """
    terminal = "error" if response.error is not None else "done"
    stmt = (
        update(IxJob)
        .where(IxJob.job_id == job_id)
        .values(
            status=terminal,
            response=response.model_dump(mode="json"),
            finished_at=datetime.now(UTC),
        )
    )
    await session.execute(stmt)
|
|
|
|
|
|
async def mark_error(
    session: AsyncSession, job_id: UUID, response: ResponseIX
) -> None:
    """Force status ``error`` regardless of the response body.

    Kept separate from :func:`mark_done` for readability at call sites:
    when the worker caught an exception the pipeline didn't handle itself,
    ``mark_error`` signals intent even if the response body happens to have
    a populated error field.
    """
    stmt = (
        update(IxJob)
        .where(IxJob.job_id == job_id)
        .values(
            status="error",
            response=response.model_dump(mode="json"),
            finished_at=datetime.now(UTC),
        )
    )
    await session.execute(stmt)
|
|
|
|
|
|
async def update_callback_status(
    session: AsyncSession,
    job_id: UUID,
    status: Literal["delivered", "failed"],
) -> None:
    """Record the outcome of the callback delivery attempt for ``job_id``."""
    stmt = (
        update(IxJob).where(IxJob.job_id == job_id).values(callback_status=status)
    )
    await session.execute(stmt)
|
|
|
|
|
|
async def sweep_orphans(
    session: AsyncSession,
    now: datetime,
    max_running_seconds: int,
) -> list[UUID]:
    """Reset stale ``running`` rows back to ``pending`` and bump ``attempts``.

    Called once at worker startup (spec §3) to rescue jobs whose owner died
    mid-pipeline. The threshold is time-based on ``started_at`` so a still-
    running worker never reclaims its own in-flight job — callers pass
    ``2 * IX_PIPELINE_REQUEST_TIMEOUT_SECONDS`` per spec.

    Returns the rescued job ids so the worker can log what it did.
    """
    cutoff = now - _as_interval(max_running_seconds)

    # Two-step SELECT-then-UPDATE (rather than UPDATE ... RETURNING) keeps
    # the id list handy for the caller with a plain update statement.
    stale_ids = (
        await session.scalars(
            select(IxJob.job_id).where(
                IxJob.status == "running",
                IxJob.started_at < cutoff,
            )
        )
    ).all()
    if not stale_ids:
        return []

    await session.execute(
        update(IxJob)
        .where(IxJob.job_id.in_(stale_ids))
        .values(
            status="pending",
            started_at=None,
            attempts=IxJob.attempts + 1,
        )
    )
    return list(stale_ids)
|
|
|
|
|
|
# Hard upper bound on ``list_recent``'s page size; larger ``limit`` values
# are clamped silently to keep the JSON payload bounded.
_LIST_RECENT_LIMIT_CAP = 200
|
|
|
|
|
|
async def list_recent(
    session: AsyncSession,
    *,
    limit: int = 50,
    offset: int = 0,
    status: str | Iterable[str] | None = None,
    client_id: str | None = None,
) -> tuple[list[Job], int]:
    """Return a page of recent jobs, newest first, plus total matching count.

    Powers the ``/ui/jobs`` listing page. Ordering is ``created_at DESC``.
    ``total`` reflects matching rows *before* limit/offset so the template
    can render "showing N of M".

    Parameters
    ----------
    limit:
        Maximum rows to return. Capped at
        :data:`_LIST_RECENT_LIMIT_CAP` (200) to bound the JSON payload
        size — callers that pass a larger value get clamped silently.
    offset:
        Non-negative row offset. Negative values raise ``ValueError``
        because the template treats offset as a page cursor; a negative
        cursor is a bug at the call site, not something to paper over.
    status:
        If set, restrict to the given status(es). Accepts a single
        :data:`Job.status` value or any iterable (list/tuple/set). Values
        outside the lifecycle enum simply match nothing — we don't try
        to validate here; the DB CHECK constraint already bounds the set.
        An empty iterable matches no rows.
    client_id:
        If set, exact match on :attr:`IxJob.client_id`. No substring /
        prefix match — simple and predictable.
    """
    if offset < 0:
        raise ValueError(f"offset must be >= 0, got {offset}")
    effective_limit = max(0, min(limit, _LIST_RECENT_LIMIT_CAP))

    filters = []
    if status is not None:
        if isinstance(status, str):
            filters.append(IxJob.status == status)
        else:
            # An empty iterable yields ``IN ()`` which SQLAlchemy renders
            # as an always-false condition, so no rows match — no special
            # case needed. (The previous if/else here had byte-identical
            # branches; collapsed.)
            filters.append(IxJob.status.in_(list(status)))
    if client_id is not None:
        filters.append(IxJob.client_id == client_id)

    # Same WHERE clauses drive both the count and the page query so
    # ``total`` always agrees with the rows the page is drawn from.
    total_q = select(func.count()).select_from(IxJob)
    list_q = select(IxJob).order_by(IxJob.created_at.desc())
    for condition in filters:
        total_q = total_q.where(condition)
        list_q = list_q.where(condition)

    total = int(await session.scalar(total_q) or 0)
    rows = (
        await session.scalars(list_q.limit(effective_limit).offset(offset))
    ).all()
    return [_orm_to_job(r) for r in rows], total
|
|
|
|
|
|
def _as_interval(seconds: int):  # type: ignore[no-untyped-def]
    """Return a server-side SQL interval expression for ``seconds``.

    We emit Postgres' ``make_interval(years, months, weeks, days, hours,
    mins, secs)`` with the seconds in the trailing positional slot, so
    asyncpg doesn't have to guess at a text-form interval cast — the
    server-side function is unambiguous and avoids locale-dependent
    parsing.
    """
    # Six leading zeros cover years..mins; only ``secs`` is populated.
    return func.make_interval(*((0,) * 6), seconds)
|