fix: better json validation

2026-05-06 16:25:04 +00:00 · 2026-05-04 18:09:40 +01:00
parent 073f9fc003
commit 42c33f4e0d
3 changed files with 21 additions and 18 deletions
--- a/application/agents/tool_executor.py
+++ b/application/agents/tool_executor.py
@@ -413,10 +413,21 @@ class ToolExecutor:
            "action_name": llm_name,
            "arguments": call_args,
        }
+        tool_data = tools_dict[tool_id]
+        # Journal first so the reconciler sees malformed calls and any
+        # subsequent ``_mark_failed`` actually updates a real row.
+        proposed_ok = _record_proposed(
+            call_id,
+            tool_data["name"],
+            action_name,
+            call_args if isinstance(call_args, dict) else {},
+            tool_id=tool_data.get("id"),
+        )
        # Defensive guard: a non-dict ``call_args`` (e.g. malformed
        # JSON on the resume path) would crash the param walk below
        # with AttributeError on ``.items()``. Surface a clean error
-        # event and journal the failure instead of killing the stream.
+        # event and flip the journal row to ``failed`` instead of
+        # killing the stream.
        if not isinstance(call_args, dict):
            error_message = (
                f"Tool call arguments must be a JSON object, got "
@@ -432,17 +443,6 @@ class ToolExecutor:
            self.tool_calls.append(tool_call_data)
            return error_message, call_id
        yield {"type": "tool_call", "data": {**tool_call_data, "status": "pending"}}
-
-        tool_data = tools_dict[tool_id]
-        # Journal the call before any side effect runs so the
-        # reconciler can see attempts that crashed mid-execute.
-        proposed_ok = _record_proposed(
-            call_id,
-            tool_data["name"],
-            action_name,
-            call_args,
-            tool_id=tool_data.get("id"),
-        )
        action_data = (
            tool_data["config"]["actions"][action_name]
            if tool_data["name"] == "api_tool"
--- a/application/llm/handlers/google.py
+++ b/application/llm/handlers/google.py
@@ -19,10 +19,12 @@ def _decode_thought_signature(
    sig: Optional[Union[bytes, str]],
 ) -> Optional[Union[bytes, str]]:
    # Reverse of _encode_thought_signature — Gemini's SDK expects bytes
-    # back when we replay a tool call, so decode at egress.
+    # back when we replay a tool call. ``validate=True`` rejects input
+    # containing non-alphabet characters instead of discarding them, so
+    # such strings pass through unchanged rather than becoming bytes.
    if isinstance(sig, str):
        try:
-            return base64.b64decode(sig.encode("ascii"))
+            return base64.b64decode(sig.encode("ascii"), validate=True)
        except (binascii.Error, ValueError):
            return sig
    return sig
--- a/application/storage/db/serialization.py
+++ b/application/storage/db/serialization.py
@@ -66,13 +66,14 @@ def decode_base64_bytes(value: Any) -> Any:
    """Reverse ``coerce_pg_native``'s bytes-to-base64 step.

    Useful at egress points that need the original bytes back (e.g.
-    sending Gemini's ``thought_signature`` to the SDK on resume). Falls
-    back to passing through on non-base64 strings so a value that wasn't
-    actually encoded survives.
+    sending Gemini's ``thought_signature`` to the SDK on resume). Uses
+    ``validate=True`` so strings that only decode once non-alphabet
+    characters are discarded (e.g. ``"ab-cd"``) pass through unchanged;
+    strictly valid base64 text such as ``"abcd"`` is still decoded.
    """
    if isinstance(value, str):
        try:
-            return base64.b64decode(value.encode("ascii"))
+            return base64.b64decode(value.encode("ascii"), validate=True)
        except (binascii.Error, ValueError):
            return value
    return value