fix(genai): drop Ollama format flag; extract trailing JSON from response

qwen3:14b (and deepseek-r1, other reasoning models) wrap their output in <think>…</think> chains-of-thought before emitting real output. With format=json the constrained sampler terminated immediately at `{}` because the thinking block wasn't valid JSON; without format the model thinks normally and appends the actual JSON at the end. OllamaClient now omits the format flag and extracts the first outermost balanced `{…}` block from the response (brace depth counter, string-literal aware). Works for reasoning models, ```json``` code-fenced outputs, and plain JSON alike. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-18 14:05:28 +02:00 · 2026-04-18 14:05:28 +02:00 · 81e3b9a7d0
commit 81e3b9a7d0
parent 763407ba1c
2 changed files with 50 additions and 6 deletions
--- a/src/ix/genai/ollama_client.py
+++ b/src/ix/genai/ollama_client.py
@ -96,8 +96,9 @@ class OllamaClient:
            ) from exc

        content = (payload.get("message") or {}).get("content") or ""
+        json_blob = _extract_json_blob(content)
        try:
-            parsed = response_schema.model_validate_json(content)
+            parsed = response_schema.model_validate_json(json_blob)
        except ValidationError as exc:
            raise IXException(
                IXErrorCode.IX_002_001,
@ -186,7 +187,12 @@ class OllamaClient:
            "model": request_kwargs.get("model"),
            "messages": messages,
            "stream": False,
-            "format": "json",
+            # NOTE: format is deliberately omitted. `format="json"` made
+            # reasoning models (qwen3) abort after emitting `{}` because the
+            # constrained sampler terminated before the chain-of-thought
+            # finished; `format=<schema>` segfaulted Ollama 0.11.8. Letting
+            # the model generate freely and then extracting the trailing JSON
+            # blob works for both reasoning and non-reasoning models.
        }

        options: dict[str, Any] = {}
@ -218,6 +224,43 @@ class OllamaClient:
        return out


+def _extract_json_blob(text: str) -> str:
+    """Return the outermost balanced JSON object in ``text``.
+
+    Reasoning models (qwen3, deepseek-r1) wrap their real answer in
+    ``<think>…</think>`` blocks. Other models sometimes prefix prose or
+    fence the JSON in ```json``` code blocks. Finding the first balanced
+    ``{…}`` is the cheapest robust parse that works for all three shapes;
+    a malformed response yields the full text and Pydantic catches it
+    downstream as ``IX_002_001``.
+    """
+    start = text.find("{")
+    if start < 0:
+        return text
+    depth = 0
+    in_string = False
+    escaped = False
+    for i in range(start, len(text)):
+        ch = text[i]
+        if in_string:
+            if escaped:
+                escaped = False
+            elif ch == "\\":
+                escaped = True
+            elif ch == '"':
+                in_string = False
+            continue
+        if ch == '"':
+            in_string = True
+        elif ch == "{":
+            depth += 1
+        elif ch == "}":
+            depth -= 1
+            if depth == 0:
+                return text[start : i + 1]
+    return text[start:]
+
+
 def _inject_schema_system_message(
    messages: list[dict[str, Any]],
    response_schema: type[BaseModel],
--- a/tests/unit/test_ollama_client.py
+++ b/tests/unit/test_ollama_client.py
@ -79,10 +79,11 @@ class TestInvokeHappyPath:
        body_json = json.loads(body)
        assert body_json["model"] == "gpt-oss:20b"
        assert body_json["stream"] is False
-        # format is "json" (loose mode): Ollama 0.11.8 segfaults on full
-        # Pydantic schemas. We pass the schema via the system prompt
-        # upstream and validate on parse.
-        assert body_json["format"] == "json"
+        # No `format` is sent: Ollama 0.11.8 segfaults on full schemas and
+        # aborts to `{}` with `format=json` on reasoning models. Schema is
+        # injected into the system prompt instead; we extract the trailing
+        # JSON blob from the response and validate via Pydantic.
+        assert "format" not in body_json
        assert body_json["options"]["temperature"] == 0.2
        assert "reasoning_effort" not in body_json
        # A schema-guidance system message is prepended to the caller's