From 81e3b9a7d04808ff102d79b37a6e799db310ab4e Mon Sep 17 00:00:00 2001
From: Dirk Riemann <ditori@gmail.com>
Date: Sat, 18 Apr 2026 14:05:28 +0200
Subject: [PATCH] fix(genai): drop Ollama format flag; extract trailing JSON
 from response
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

qwen3:14b (and deepseek-r1, other reasoning models) emit a
<think>…</think> chain-of-thought block before their real output.
With format=json the constrained sampler terminated immediately at
`{}` because the thinking block wasn't valid JSON; without format the
model thinks normally and appends the actual JSON at the end.

OllamaClient now omits the format flag and extracts the first balanced
top-level `{…}` block from the response (brace depth counter, string-
literal aware). Works for reasoning models, ```json``` code-fenced
outputs, and plain JSON alike.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 src/ix/genai/ollama_client.py    | 47 ++++++++++++++++++++++++++++++--
 tests/unit/test_ollama_client.py |  9 +++---
 2 files changed, 50 insertions(+), 6 deletions(-)
diff --git a/src/ix/genai/ollama_client.py b/src/ix/genai/ollama_client.py
index 0cc3783..d060b6e 100644
--- a/src/ix/genai/ollama_client.py
+++ b/src/ix/genai/ollama_client.py
@@ -96,8 +96,9 @@ class OllamaClient:
             ) from exc
 
         content = (payload.get("message") or {}).get("content") or ""
+        json_blob = _extract_json_blob(content)
         try:
-            parsed = response_schema.model_validate_json(content)
+            parsed = response_schema.model_validate_json(json_blob)
         except ValidationError as exc:
             raise IXException(
                 IXErrorCode.IX_002_001,
@@ -186,7 +187,12 @@ class OllamaClient:
             "model": request_kwargs.get("model"),
             "messages": messages,
             "stream": False,
-            "format": "json",
+            # NOTE: format is deliberately omitted. `format="json"` made
+            # reasoning models (qwen3) abort after emitting `{}` because the
+            # constrained sampler terminated before the chain-of-thought
+            # finished; `format=<schema>` segfaulted Ollama 0.11.8. Letting
+            # the model generate freely and then extracting the JSON object
+            # from the reply works for reasoning and non-reasoning models.
         }
 
         options: dict[str, Any] = {}
@@ -218,6 +224,43 @@ class OllamaClient:
         return out
 
 
+def _extract_json_blob(text: str) -> str:
+    """Return the first balanced top-level ``{…}`` span found in ``text``.
+
+    Reasoning models (qwen3, deepseek-r1) emit ``<think>…</think>``
+    blocks before their real answer; other models sometimes prefix prose
+    or fence the JSON in ```json``` code blocks. No ``{`` returns ``text``
+    unchanged; an unclosed ``{`` returns everything from it onward.
+    NOTE(review): the scan starts at the FIRST ``{``, so a brace in the
+    preamble wins over the trailing answer -- confirm this is intended.
+    """
+    start = text.find("{")
+    if start < 0:
+        return text
+    depth = 0
+    in_string = False
+    escaped = False
+    for i in range(start, len(text)):
+        ch = text[i]
+        if in_string:
+            if escaped:
+                escaped = False  # previous char was a backslash; skip this one
+            elif ch == "\\":
+                escaped = True
+            elif ch == '"':
+                in_string = False
+            continue  # inside a string literal: braces are not counted
+        if ch == '"':
+            in_string = True
+        elif ch == "{":
+            depth += 1
+        elif ch == "}":
+            depth -= 1
+            if depth == 0:
+                return text[start : i + 1]  # matching close of the first "{"
+    return text[start:]  # brace never balanced: return the unclosed tail
+
+
 def _inject_schema_system_message(
     messages: list[dict[str, Any]],
     response_schema: type[BaseModel],
diff --git a/tests/unit/test_ollama_client.py b/tests/unit/test_ollama_client.py
index 57f9f01..d6ee41f 100644
--- a/tests/unit/test_ollama_client.py
+++ b/tests/unit/test_ollama_client.py
@@ -79,10 +79,11 @@ class TestInvokeHappyPath:
         body_json = json.loads(body)
         assert body_json["model"] == "gpt-oss:20b"
         assert body_json["stream"] is False
-        # format is "json" (loose mode): Ollama 0.11.8 segfaults on full
-        # Pydantic schemas. We pass the schema via the system prompt
-        # upstream and validate on parse.
-        assert body_json["format"] == "json"
+        # No `format` is sent: Ollama 0.11.8 segfaults on full schemas and
+        # aborts to `{}` with `format=json` on reasoning models. Schema is
+        # injected into the system prompt instead; we extract the JSON
+        # object from the response and validate it via Pydantic.
+        assert "format" not in body_json
         assert body_json["options"]["temperature"] == 0.2
         assert "reasoning_effort" not in body_json
         # A schema-guidance system message is prepended to the caller's