Mirror of https://github.com/arc53/DocsGPT.git — synced 2026-05-13 23:53:05 +00:00

Compare commits: feat-durab... ... convsearch
6 Commits

| SHA1 |
|---|
| 8b9eb5cffe |
| 1a764c6ee8 |
| 4d6f360e3a |
| e245057822 |
| e692c645b9 |
| b4c4ab68f0 |
@@ -1,18 +1,107 @@
import logging
import uuid
from collections import Counter
from typing import Dict, List, Optional, Tuple
from typing import Any, Dict, List, Optional, Tuple

from application.agents.tools.tool_action_parser import ToolActionParser
from application.agents.tools.tool_manager import ToolManager
from application.security.encryption import decrypt_credentials
from application.storage.db.base_repository import looks_like_uuid
from application.storage.db.repositories.agents import AgentsRepository
from application.storage.db.repositories.tool_call_attempts import (
    ToolCallAttemptsRepository,
)
from application.storage.db.repositories.user_tools import UserToolsRepository
from application.storage.db.session import db_readonly
from application.storage.db.session import db_readonly, db_session

logger = logging.getLogger(__name__)


def _record_proposed(
    call_id: str,
    tool_name: str,
    action_name: str,
    arguments: Any,
    *,
    tool_id: Optional[str] = None,
) -> bool:
    """Insert a ``proposed`` row; swallow infra failures so tool calls
    still run when the journal is unreachable. Returns True iff the row
    is now journaled (newly created or already present).
    """
    try:
        with db_session() as conn:
            inserted = ToolCallAttemptsRepository(conn).record_proposed(
                call_id,
                tool_name,
                action_name,
                arguments,
                tool_id=tool_id if tool_id and looks_like_uuid(tool_id) else None,
            )
            if not inserted:
                logger.warning(
                    "tool_call_attempts duplicate call_id=%s; existing row left in place",
                    call_id,
                    extra={"alert": "tool_call_id_collision", "call_id": call_id},
                )
        return True
    except Exception:
        logger.exception("tool_call_attempts proposed write failed for %s", call_id)
        return False


def _mark_executed(
    call_id: str,
    result: Any,
    *,
    message_id: Optional[str] = None,
    artifact_id: Optional[str] = None,
    proposed_ok: bool = True,
    tool_name: Optional[str] = None,
    action_name: Optional[str] = None,
    arguments: Any = None,
    tool_id: Optional[str] = None,
) -> None:
    """Flip the row to ``executed``. If ``proposed_ok`` is False (the
    proposed write failed earlier), upsert a fresh row in ``executed`` so
    the reconciler can still see the attempt — without this, the side
    effect would be invisible to the journal.
    """
    try:
        with db_session() as conn:
            repo = ToolCallAttemptsRepository(conn)
            if proposed_ok:
                updated = repo.mark_executed(
                    call_id,
                    result,
                    message_id=message_id,
                    artifact_id=artifact_id,
                )
                if updated:
                    return
            # Fallback synthesizes the row so the journal isn't lost.
            repo.upsert_executed(
                call_id,
                tool_name=tool_name or "unknown",
                action_name=action_name or "",
                arguments=arguments if arguments is not None else {},
                result=result,
                tool_id=tool_id if tool_id and looks_like_uuid(tool_id) else None,
                message_id=message_id,
                artifact_id=artifact_id,
            )
    except Exception:
        logger.exception("tool_call_attempts executed write failed for %s", call_id)


def _mark_failed(call_id: str, error: str) -> None:
    try:
        with db_session() as conn:
            ToolCallAttemptsRepository(conn).mark_failed(call_id, error)
    except Exception:
        logger.exception("tool_call_attempts failed-write failed for %s", call_id)


class ToolExecutor:
    """Handles tool discovery, preparation, and execution.

@@ -31,6 +120,7 @@ class ToolExecutor:
        self.tool_calls: List[Dict] = []
        self._loaded_tools: Dict[str, object] = {}
        self.conversation_id: Optional[str] = None
        self.message_id: Optional[str] = None
        self.client_tools: Optional[List[Dict]] = None
        self._name_to_tool: Dict[str, Tuple[str, str]] = {}
        self._tool_to_name: Dict[Tuple[str, str], str] = {}
@@ -323,9 +413,36 @@
            "action_name": llm_name,
            "arguments": call_args,
        }
        yield {"type": "tool_call", "data": {**tool_call_data, "status": "pending"}}

        tool_data = tools_dict[tool_id]
        # Journal first so the reconciler sees malformed calls and any
        # subsequent ``_mark_failed`` actually updates a real row.
        proposed_ok = _record_proposed(
            call_id,
            tool_data["name"],
            action_name,
            call_args if isinstance(call_args, dict) else {},
            tool_id=tool_data.get("id"),
        )
        # Defensive guard: a non-dict ``call_args`` (e.g. malformed
        # JSON on the resume path) would crash the param walk below
        # with AttributeError on ``.items()``. Surface a clean error
        # event and flip the journal row to ``failed`` instead of
        # killing the stream.
        if not isinstance(call_args, dict):
            error_message = (
                f"Tool call arguments must be a JSON object, got "
                f"{type(call_args).__name__}."
            )
            tool_call_data["result"] = error_message
            tool_call_data["arguments"] = {}
            _mark_failed(call_id, error_message)
            yield {
                "type": "tool_call",
                "data": {**tool_call_data, "status": "error"},
            }
            self.tool_calls.append(tool_call_data)
            return error_message, call_id
        yield {"type": "tool_call", "data": {**tool_call_data, "status": "pending"}}
        action_data = (
            tool_data["config"]["actions"][action_name]
            if tool_data["name"] == "api_tool"
@@ -381,6 +498,7 @@
                },
            )
            tool_call_data["result"] = error_message
            _mark_failed(call_id, error_message)
            yield {"type": "tool_call", "data": {**tool_call_data, "status": "error"}}
            self.tool_calls.append(tool_call_data)
            return error_message, call_id
@@ -390,14 +508,18 @@
            if tool_data["name"] == "api_tool"
            else parameters
        )
        if tool_data["name"] == "api_tool":
            logger.debug(
                f"Executing api: {action_name} with query_params: {query_params}, headers: {headers}, body: {body}"
            )
            result = tool.execute_action(action_name, **body)
        else:
            logger.debug(f"Executing tool: {action_name} with args: {call_args}")
            result = tool.execute_action(action_name, **parameters)
        try:
            if tool_data["name"] == "api_tool":
                logger.debug(
                    f"Executing api: {action_name} with query_params: {query_params}, headers: {headers}, body: {body}"
                )
                result = tool.execute_action(action_name, **body)
            else:
                logger.debug(f"Executing tool: {action_name} with args: {call_args}")
                result = tool.execute_action(action_name, **parameters)
        except Exception as exc:
            _mark_failed(call_id, str(exc))
            raise

        get_artifact_id = (
            getattr(tool, "get_artifact_id", None)
@@ -426,6 +548,22 @@
            f"{result_full[:50]}..." if len(result_full) > 50 else result_full
        )

        # Tool side effect has run; flip the journal row so the
        # message-finalize path can later confirm it. If the proposed
        # write failed (DB outage), upsert a fresh row in ``executed`` so
        # the reconciler still sees the side effect.
        _mark_executed(
            call_id,
            result_full,
            message_id=self.message_id,
            artifact_id=artifact_id or None,
            proposed_ok=proposed_ok,
            tool_name=tool_data["name"],
            action_name=action_name,
            arguments=call_args,
            tool_id=tool_data.get("id"),
        )

        stream_tool_call_data = {
            key: value
            for key, value in tool_call_data.items()

@@ -177,3 +177,4 @@ class PostgresTool(Tool):
                "order": 1,
            },
        }

@@ -57,6 +57,29 @@ class ToolActionParser:
    def _parse_google_llm(self, call):
        try:
            call_args = call.arguments
            # Gemini's SDK natively returns ``args`` as a dict, but the
            # resume path (``gen_continuation``) stringifies it for the
            # assistant message. Coerce a JSON string back into a dict;
            # fall back to an empty dict on malformed input so downstream
            # ``call_args.items()`` doesn't crash the stream.
            if isinstance(call_args, str):
                try:
                    call_args = json.loads(call_args)
                except (json.JSONDecodeError, TypeError):
                    logger.warning(
                        "Google call.arguments was not valid JSON; "
                        "falling back to empty args for %s",
                        getattr(call, "name", "<unknown>"),
                    )
                    call_args = {}
            if not isinstance(call_args, dict):
                logger.warning(
                    "Google call.arguments has unexpected type %s; "
                    "falling back to empty args for %s",
                    type(call_args).__name__,
                    getattr(call, "name", "<unknown>"),
                )
                call_args = {}

            resolved = self._resolve_via_mapping(call.name)
            if resolved:
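
The helpers above only write the journal; the consumer they mention — a reconciler that sweeps non-terminal rows — is not part of this diff. A minimal sketch of such a sweep, reusing the `db_session` / `sql_text` plumbing seen elsewhere in this changeset; the function name, the 30-minute cutoff, and the log-only escalation are illustrative assumptions:

import logging

from sqlalchemy import text as sql_text

from application.storage.db.session import db_session

logger = logging.getLogger(__name__)


def sweep_stale_tool_calls() -> None:
    # Hypothetical reconciler pass; it can use the partial index
    # tool_call_attempts_pending_ts_idx (status IN ('proposed', 'executed'))
    # added by migration 0004 below. The cutoff is an assumed value.
    with db_session() as conn:
        rows = conn.execute(
            sql_text(
                "SELECT call_id, status, tool_name, action_name "
                "FROM tool_call_attempts "
                "WHERE status IN ('proposed', 'executed') "
                "AND attempted_at < now() - INTERVAL '30 minutes'"
            )
        ).fetchall()
    for call_id, status, tool_name, action_name in rows:
        # 'proposed' that never flipped: the tool may not have run at all.
        # 'executed' that was never confirmed: the finalize path died after
        # the side effect — the case the upsert fallback above protects.
        logger.warning(
            "stale tool_call_attempts row call_id=%s status=%s (%s.%s)",
            call_id, status, tool_name, action_name,
        )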

application/alembic/versions/0004_durability_foundation.py (new file, 217 lines)
@@ -0,0 +1,217 @@
"""0004 durability foundation — idempotency, tool-call log, ingest checkpoint.

Adds ``task_dedup``, ``webhook_dedup``, ``tool_call_attempts``,
``ingest_chunk_progress``, and per-row status flags on
``conversation_messages`` and ``pending_tool_state``. Also adds
``token_usage.source`` and ``token_usage.request_id`` so per-channel
cost attribution (``agent_stream`` / ``title`` / ``compression`` /
``rag_condense`` / ``fallback``) is queryable and multi-call agent runs
can be DISTINCT-collapsed into a single user request for rate limiting.

Revision ID: 0004_durability_foundation
Revises: 0003_user_custom_models
"""

from typing import Sequence, Union

from alembic import op


revision: str = "0004_durability_foundation"
down_revision: Union[str, None] = "0003_user_custom_models"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    # ------------------------------------------------------------------
    # New tables
    # ------------------------------------------------------------------
    # ``attempt_count`` bounds the per-Celery-task idempotency wrapper's
    # retry loop so a poison message can't run forever; default 0 means
    # existing rows behave as if no attempts have run yet.
    op.execute(
        """
        CREATE TABLE task_dedup (
            idempotency_key TEXT PRIMARY KEY,
            task_name TEXT NOT NULL,
            task_id TEXT NOT NULL,
            result_json JSONB,
            status TEXT NOT NULL
                CHECK (status IN ('pending', 'completed', 'failed')),
            attempt_count INT NOT NULL DEFAULT 0,
            created_at TIMESTAMPTZ NOT NULL DEFAULT now()
        );
        """
    )

    op.execute(
        """
        CREATE TABLE webhook_dedup (
            idempotency_key TEXT PRIMARY KEY,
            agent_id UUID NOT NULL,
            task_id TEXT NOT NULL,
            response_json JSONB,
            created_at TIMESTAMPTZ NOT NULL DEFAULT now()
        );
        """
    )

    # FK on ``message_id`` uses ``ON DELETE SET NULL`` so the journal row
    # survives parent-message deletion (compliance / cost-attribution).
    op.execute(
        """
        CREATE TABLE tool_call_attempts (
            call_id TEXT PRIMARY KEY,
            message_id UUID
                REFERENCES conversation_messages (id)
                ON DELETE SET NULL,
            tool_id UUID,
            tool_name TEXT NOT NULL,
            action_name TEXT NOT NULL,
            arguments JSONB NOT NULL,
            result JSONB,
            error TEXT,
            status TEXT NOT NULL
                CHECK (status IN (
                    'proposed', 'executed', 'confirmed',
                    'compensated', 'failed'
                )),
            attempted_at TIMESTAMPTZ NOT NULL DEFAULT now(),
            updated_at TIMESTAMPTZ NOT NULL DEFAULT now()
        );
        """
    )

    op.execute(
        """
        CREATE TABLE ingest_chunk_progress (
            source_id UUID PRIMARY KEY,
            total_chunks INT NOT NULL,
            embedded_chunks INT NOT NULL DEFAULT 0,
            last_index INT NOT NULL DEFAULT -1,
            last_updated TIMESTAMPTZ NOT NULL DEFAULT now()
        );
        """
    )

    # ------------------------------------------------------------------
    # Column additions on existing tables
    # ------------------------------------------------------------------
    # DEFAULT 'complete' backfills existing rows — they're already done.
    op.execute(
        """
        ALTER TABLE conversation_messages
            ADD COLUMN status TEXT NOT NULL DEFAULT 'complete'
                CHECK (status IN ('pending', 'streaming', 'complete', 'failed')),
            ADD COLUMN request_id TEXT;
        """
    )

    op.execute(
        """
        ALTER TABLE pending_tool_state
            ADD COLUMN status TEXT NOT NULL DEFAULT 'pending'
                CHECK (status IN ('pending', 'resuming')),
            ADD COLUMN resumed_at TIMESTAMPTZ;
        """
    )

    # Default ``agent_stream`` backfills historical rows under the
    # assumption they were written from the primary path — pre-fix the
    # only path that wrote was the error branch reading agent.llm.
    # ``request_id`` is the stream-scoped UUID stamped by the route on
    # ``agent.llm`` so multi-tool agent runs (which produce N rows)
    # collapse to one request via DISTINCT in ``count_in_range``.
    # Side-channel sources (``title`` / ``compression`` / ``rag_condense``
    # / ``fallback``) leave it NULL and are excluded from the request
    # count by source filter.
    op.execute(
        """
        ALTER TABLE token_usage
            ADD COLUMN source TEXT NOT NULL DEFAULT 'agent_stream',
            ADD COLUMN request_id TEXT;
        """
    )

    # ------------------------------------------------------------------
    # Indexes — partial where the predicate selects only non-terminal rows
    # ------------------------------------------------------------------
    op.execute(
        "CREATE INDEX conversation_messages_pending_ts_idx "
        "ON conversation_messages (timestamp) "
        "WHERE status IN ('pending', 'streaming');"
    )
    op.execute(
        "CREATE INDEX tool_call_attempts_pending_ts_idx "
        "ON tool_call_attempts (attempted_at) "
        "WHERE status IN ('proposed', 'executed');"
    )
    op.execute(
        "CREATE INDEX tool_call_attempts_message_idx "
        "ON tool_call_attempts (message_id) "
        "WHERE message_id IS NOT NULL;"
    )
    op.execute(
        "CREATE INDEX pending_tool_state_resuming_ts_idx "
        "ON pending_tool_state (resumed_at) "
        "WHERE status = 'resuming';"
    )
    op.execute(
        "CREATE INDEX webhook_dedup_agent_idx "
        "ON webhook_dedup (agent_id);"
    )
    op.execute(
        "CREATE INDEX task_dedup_pending_attempts_idx "
        "ON task_dedup (attempt_count) WHERE status = 'pending';"
    )
    # Cost-attribution dashboards filter ``token_usage`` by
    # ``(timestamp, source)``; index the same shape so they stay cheap.
    op.execute(
        "CREATE INDEX token_usage_source_ts_idx "
        "ON token_usage (source, timestamp);"
    )
    # Partial index — only rows with a stamped request_id participate
    # in the DISTINCT count. NULL rows fall through to the COUNT(*)
    # branch in the repository query.
    op.execute(
        "CREATE INDEX token_usage_request_id_idx "
        "ON token_usage (request_id) "
        "WHERE request_id IS NOT NULL;"
    )

    op.execute(
        "CREATE TRIGGER tool_call_attempts_set_updated_at "
        "BEFORE UPDATE ON tool_call_attempts "
        "FOR EACH ROW WHEN (OLD.* IS DISTINCT FROM NEW.*) "
        "EXECUTE FUNCTION set_updated_at();"
    )


def downgrade() -> None:
    # CASCADE so the downgrade stays safe if later migrations FK into these.
    for table in (
        "ingest_chunk_progress",
        "tool_call_attempts",
        "webhook_dedup",
        "task_dedup",
    ):
        op.execute(f"DROP TABLE IF EXISTS {table} CASCADE;")

    op.execute(
        "ALTER TABLE conversation_messages "
        "DROP COLUMN IF EXISTS request_id, "
        "DROP COLUMN IF EXISTS status;"
    )
    op.execute(
        "ALTER TABLE pending_tool_state "
        "DROP COLUMN IF EXISTS resumed_at, "
        "DROP COLUMN IF EXISTS status;"
    )
    op.execute("DROP INDEX IF EXISTS token_usage_request_id_idx;")
    op.execute("DROP INDEX IF EXISTS token_usage_source_ts_idx;")
    op.execute(
        "ALTER TABLE token_usage "
        "DROP COLUMN IF EXISTS request_id, "
        "DROP COLUMN IF EXISTS source;"
    )
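
Two pieces this migration leans on are defined elsewhere and do not appear in the diff: the trigger references a ``set_updated_at()`` function that must already exist (presumably from an earlier migration), and the comments describe a ``count_in_range`` repository query over ``token_usage``. A sketch of one plausible shape for each — both are assumptions, not the project's actual code:

# Hypothetical — how set_updated_at() is conventionally defined in
# Postgres; not part of this diff.
op.execute(
    """
    CREATE OR REPLACE FUNCTION set_updated_at() RETURNS trigger AS $$
    BEGIN
        NEW.updated_at = now();
        RETURN NEW;
    END;
    $$ LANGUAGE plpgsql;
    """
)

# Hypothetical count_in_range shape: stamped rows collapse via DISTINCT,
# unstamped historical rows fall through to a plain count, and side-channel
# sources are excluded by the source filter.
#   SELECT COUNT(DISTINCT request_id)
#        + COUNT(*) FILTER (WHERE request_id IS NULL)
#   FROM token_usage
#   WHERE source = 'agent_stream'
#     AND timestamp BETWEEN :range_start AND :range_end;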

application/alembic/versions/0005_ingest_attempt_id.py (new file, 44 lines)
@@ -0,0 +1,44 @@
"""0005 ingest_chunk_progress.attempt_id — per-attempt resume scoping.

Without this column, a completed checkpoint row poisoned every later
embed call on the same ``source_id``: a sync after an upload finished
read the upload's terminal ``last_index`` and either embedded zero
chunks (if new ``total_docs <= last_index + 1``) or stacked new chunks
on top of the old vectors (if ``total_docs > last_index + 1``).

``attempt_id`` is stamped from ``self.request.id`` (Celery's stable
task id, which survives ``acks_late`` retries of the same task but
differs across separate task invocations). The repository's
``init_progress`` upsert resets ``last_index`` / ``embedded_chunks``
when the incoming ``attempt_id`` differs from the stored one — so a
fresh sync starts from chunk 0 while a retry of the same task resumes
from the last checkpointed chunk.

Revision ID: 0005_ingest_attempt_id
Revises: 0004_durability_foundation
"""

from typing import Sequence, Union

from alembic import op


revision: str = "0005_ingest_attempt_id"
down_revision: Union[str, None] = "0004_durability_foundation"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    op.execute(
        """
        ALTER TABLE ingest_chunk_progress
            ADD COLUMN attempt_id TEXT;
        """
    )


def downgrade() -> None:
    op.execute(
        "ALTER TABLE ingest_chunk_progress DROP COLUMN IF EXISTS attempt_id;"
    )
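
The ``init_progress`` upsert the docstring describes lives in the repository layer, not in this migration. A minimal sketch of the attempt-scoped reset it implies — the table and column names come from migrations 0004/0005, the SQL itself is assumed:

from sqlalchemy import text as sql_text

# Hypothetical repository-side upsert. Same attempt_id (an acks_late retry
# of the same Celery task) keeps the checkpoint; a different attempt_id
# (a fresh sync) resets it so embedding restarts from chunk 0.
INIT_PROGRESS_SQL = sql_text(
    """
    INSERT INTO ingest_chunk_progress (source_id, total_chunks, attempt_id)
    VALUES (:source_id, :total_chunks, :attempt_id)
    ON CONFLICT (source_id) DO UPDATE SET
        total_chunks = EXCLUDED.total_chunks,
        attempt_id = EXCLUDED.attempt_id,
        last_index = CASE
            WHEN ingest_chunk_progress.attempt_id
                 IS DISTINCT FROM EXCLUDED.attempt_id
            THEN -1
            ELSE ingest_chunk_progress.last_index
        END,
        embedded_chunks = CASE
            WHEN ingest_chunk_progress.attempt_id
                 IS DISTINCT FROM EXCLUDED.attempt_id
            THEN 0
            ELSE ingest_chunk_progress.embedded_chunks
        END,
        last_updated = now();
    """
)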

application/alembic/versions/0006_idempotency_lease.py (new file, 57 lines)
@@ -0,0 +1,57 @@
"""0006 task_dedup lease columns — running-lease for in-flight tasks.

Without these, ``with_idempotency`` only short-circuits *completed*
rows. A late-ack redelivery (Redis ``visibility_timeout`` exceeded by a
long ingest, or a hung-but-alive worker) hands the same message to a
second worker; ``_claim_or_bump`` only bumped the attempt counter and
both workers ran the task body in parallel — duplicate vector writes,
duplicate token spend, duplicate webhook side effects.

``lease_owner_id`` + ``lease_expires_at`` turn that into an atomic
compare-and-swap. The wrapper claims a lease at entry, refreshes it via
a 30 s heartbeat thread, and finalises (which makes the lease moot via
``status='completed'``). A second worker hitting the same key sees a
fresh lease and ``self.retry(countdown=LEASE_TTL)``s instead of running.
A crashed worker's lease expires after ``LEASE_TTL`` seconds and the
next retry can claim it.

Revision ID: 0006_idempotency_lease
Revises: 0005_ingest_attempt_id
"""

from typing import Sequence, Union

from alembic import op


revision: str = "0006_idempotency_lease"
down_revision: Union[str, None] = "0005_ingest_attempt_id"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    op.execute(
        """
        ALTER TABLE task_dedup
            ADD COLUMN lease_owner_id TEXT,
            ADD COLUMN lease_expires_at TIMESTAMPTZ;
        """
    )
    # Reconciler's stuck-pending sweep filters by
    # ``(status='pending', lease_expires_at < now() - 60s, attempt_count >= 5)``.
    # Partial index keeps the scan small even under heavy task throughput.
    op.execute(
        "CREATE INDEX task_dedup_pending_lease_idx "
        "ON task_dedup (lease_expires_at) "
        "WHERE status = 'pending';"
    )


def downgrade() -> None:
    op.execute("DROP INDEX IF EXISTS task_dedup_pending_lease_idx;")
    op.execute(
        "ALTER TABLE task_dedup "
        "DROP COLUMN IF EXISTS lease_expires_at, "
        "DROP COLUMN IF EXISTS lease_owner_id;"
    )
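
The ``with_idempotency`` wrapper itself sits outside this diff; the atomic compare-and-swap its docstring describes could be a single statement along these lines. ``LEASE_TTL`` and every name below are assumptions, not the real implementation:

from sqlalchemy import text as sql_text

LEASE_TTL = 120  # seconds — illustrative; the real value is not in this diff

# Claim the lease iff the row is new, already ours, or expired.
# One row returned == claimed; no row == a live lease exists, so the
# caller should self.retry(countdown=LEASE_TTL) instead of running.
CLAIM_LEASE_SQL = sql_text(
    """
    INSERT INTO task_dedup
        (idempotency_key, task_name, task_id, status,
         lease_owner_id, lease_expires_at)
    VALUES (:key, :task_name, :task_id, 'pending',
            :owner, now() + make_interval(secs => :ttl))
    ON CONFLICT (idempotency_key) DO UPDATE SET
        lease_owner_id = EXCLUDED.lease_owner_id,
        lease_expires_at = EXCLUDED.lease_expires_at,
        attempt_count = task_dedup.attempt_count + 1
    WHERE task_dedup.status = 'pending'
      AND (task_dedup.lease_owner_id = EXCLUDED.lease_owner_id
           OR task_dedup.lease_expires_at IS NULL
           OR task_dedup.lease_expires_at < now())
    RETURNING idempotency_key;
    """
)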
@@ -102,6 +102,8 @@ class AnswerResource(Resource, BaseAnswerResource):
                    "tools_dict": tools_dict,
                    "pending_tool_calls": pending_tool_calls,
                    "tool_actions": tool_actions,
                    "reserved_message_id": processor.reserved_message_id,
                    "request_id": processor.request_id,
                },
            )
        else:

@@ -1,13 +1,18 @@
import datetime
import json
import logging
import time
import uuid
from typing import Any, Dict, Generator, List, Optional

from flask import jsonify, make_response, Response
from flask_restx import Namespace

from application.api.answer.services.continuation_service import ContinuationService
from application.api.answer.services.conversation_service import ConversationService
from application.api.answer.services.conversation_service import (
    ConversationService,
    TERMINATED_RESPONSE_PLACEHOLDER,
)
from application.core.model_utils import (
    get_api_key_for_provider,
    get_default_model_id,
@@ -203,13 +208,118 @@ class BaseAnswerResource:
        Yields:
            Server-sent event strings
        """
        response_full, thought, source_log_docs, tool_calls = "", "", [], []
        is_structured = False
        schema_info = None
        structured_chunks = []
        query_metadata: Dict[str, Any] = {}
        paused = False

        # One id shared across the WAL row, primary LLM (token_usage
        # attribution), the SSE event, and resumed continuations.
        request_id = (
            _continuation.get("request_id") if _continuation else None
        ) or str(uuid.uuid4())

        # Reserve the placeholder row before the LLM call so a crash
        # mid-stream still leaves the question queryable. Continuations
        # reuse the original placeholder.
        reserved_message_id: Optional[str] = None
        wal_eligible = should_save_conversation and not _continuation
        if wal_eligible:
            try:
                reservation = self.conversation_service.save_user_question(
                    conversation_id=conversation_id,
                    question=question,
                    decoded_token=decoded_token,
                    attachment_ids=attachment_ids,
                    api_key=user_api_key,
                    agent_id=agent_id,
                    is_shared_usage=is_shared_usage,
                    shared_token=shared_token,
                    model_id=model_id or self.default_model_id,
                    request_id=request_id,
                    index=index,
                )
                conversation_id = reservation["conversation_id"]
                reserved_message_id = reservation["message_id"]
            except Exception as e:
                logger.error(
                    f"Failed to reserve message row before stream: {e}",
                    exc_info=True,
                )
        elif _continuation and _continuation.get("reserved_message_id"):
            reserved_message_id = _continuation["reserved_message_id"]

        primary_llm = getattr(agent, "llm", None)
        if primary_llm is not None:
            primary_llm._request_id = request_id

        # Flipped to ``streaming`` on first chunk; reconciler uses this
        # to tell "never started" from "in flight".
        streaming_marked = False
        # Heartbeat goes into ``metadata.last_heartbeat_at`` (not
        # ``updated_at``, which reconciler-side writes share) and uses
        # ``time.monotonic`` so a blocked event loop can't fake fresh.
        STREAM_HEARTBEAT_INTERVAL = 60
        last_heartbeat_at = time.monotonic()

        def _mark_streaming_once() -> None:
            nonlocal streaming_marked, last_heartbeat_at
            if streaming_marked or not reserved_message_id:
                return
            try:
                self.conversation_service.update_message_status(
                    reserved_message_id, "streaming",
                )
            except Exception:
                logger.exception(
                    "update_message_status streaming failed for %s",
                    reserved_message_id,
                )
            streaming_marked = True
            last_heartbeat_at = time.monotonic()

        def _heartbeat_streaming() -> None:
            nonlocal last_heartbeat_at
            if not reserved_message_id or not streaming_marked:
                return
            now_mono = time.monotonic()
            if now_mono - last_heartbeat_at < STREAM_HEARTBEAT_INTERVAL:
                return
            try:
                self.conversation_service.heartbeat_message(
                    reserved_message_id,
                )
            except Exception:
                logger.exception(
                    "stream heartbeat update failed for %s",
                    reserved_message_id,
                )
            last_heartbeat_at = now_mono

        # Correlates tool_call_attempts rows with this message.
        if reserved_message_id and getattr(agent, "tool_executor", None):
            try:
                agent.tool_executor.message_id = reserved_message_id
            except Exception:
                pass

        try:
            response_full, thought, source_log_docs, tool_calls = "", "", [], []
            is_structured = False
            schema_info = None
            structured_chunks = []
            query_metadata = {}
            paused = False
            # Surface the placeholder id before any LLM tokens so a
            # mid-handshake disconnect still has a row to tail-poll.
            if reserved_message_id:
                early_event = json.dumps(
                    {
                        "type": "message_id",
                        "message_id": reserved_message_id,
                        "conversation_id": (
                            str(conversation_id) if conversation_id else None
                        ),
                        "request_id": request_id,
                    }
                )
                yield f"data: {early_event}\n\n"

            if _continuation:
                gen_iter = agent.gen_continuation(
@@ -222,9 +332,13 @@ class BaseAnswerResource:
                gen_iter = agent.gen(query=question)

            for line in gen_iter:
                # Cheap closure check that only hits the DB when the
                # heartbeat interval has elapsed.
                _heartbeat_streaming()
                if "metadata" in line:
                    query_metadata.update(line["metadata"])
                elif "answer" in line:
                    _mark_streaming_once()
                    response_full += str(line["answer"])
                    if line.get("structured"):
                        is_structured = True
@@ -234,6 +348,7 @@
                        data = json.dumps({"type": "answer", "answer": line["answer"]})
                        yield f"data: {data}\n\n"
                elif "sources" in line:
                    _mark_streaming_once()
                    truncated_sources = []
                    source_log_docs = line["sources"]
                    for source in line["sources"]:
@@ -286,12 +401,9 @@
            if paused:
                continuation = getattr(agent, "_pending_continuation", None)
                if continuation:
                    # Ensure we have a conversation_id — create a partial
                    # conversation if this is the first turn.
                    # First-turn pause needs a conversation row to attach to.
                    if not conversation_id and should_save_conversation:
                        try:
                            # Use model-owner scope so shared-agent
                            # owner-BYOM resolves to its registered plugin.
                            provider = (
                                get_provider_from_model_id(
                                    model_id,
@@ -352,8 +464,8 @@
                        tool_schemas=getattr(agent, "tools", []),
                        agent_config={
                            "model_id": model_id or self.default_model_id,
                            # Persist BYOM scope so resume doesn't
                            # fall back to caller's layer.
                            # BYOM scope; without it resume falls
                            # back to caller's layer.
                            "model_user_id": model_user_id,
                            "llm_name": getattr(agent, "llm_name", settings.LLM_PROVIDER),
                            "api_key": getattr(agent, "api_key", None),
@@ -363,6 +475,11 @@
                            "prompt": getattr(agent, "prompt", ""),
                            "json_schema": getattr(agent, "json_schema", None),
                            "retriever_config": getattr(agent, "retriever_config", None),
                            # Reused on resume so the same WAL row
                            # is finalised and request_id stays
                            # consistent across token_usage rows.
                            "reserved_message_id": reserved_message_id,
                            "request_id": request_id,
                        },
                        client_tools=getattr(
                            agent.tool_executor, "client_tools", None
@@ -385,8 +502,7 @@
                if isNoneDoc:
                    for doc in source_log_docs:
                        doc["source"] = "None"
                # Run under model-owner scope so title-gen LLM inside
                # save_conversation uses the owner's BYOM provider/key.
                # Model-owner scope so title-gen uses owner's BYOM key.
                provider = (
                    get_provider_from_model_id(
                        model_id,
@@ -407,26 +523,49 @@
                    agent_id=agent_id,
                    model_user_id=model_user_id,
                )
                # Title-gen only; agent stream tokens live on ``agent.llm``.
                llm._token_usage_source = "title"

                if should_save_conversation:
                    conversation_id = self.conversation_service.save_conversation(
                        conversation_id,
                        question,
                        response_full,
                        thought,
                        source_log_docs,
                        tool_calls,
                        llm,
                        model_id or self.default_model_id,
                        decoded_token,
                        index=index,
                        api_key=user_api_key,
                        agent_id=agent_id,
                        is_shared_usage=is_shared_usage,
                        shared_token=shared_token,
                        attachment_ids=attachment_ids,
                        metadata=query_metadata if query_metadata else None,
                    )
                    if reserved_message_id is not None:
                        self.conversation_service.finalize_message(
                            reserved_message_id,
                            response_full,
                            thought=thought,
                            sources=source_log_docs,
                            tool_calls=tool_calls,
                            model_id=model_id or self.default_model_id,
                            metadata=query_metadata if query_metadata else None,
                            status="complete",
                            title_inputs={
                                "llm": llm,
                                "question": question,
                                "response": response_full,
                                "model_id": model_id or self.default_model_id,
                                "fallback_name": (
                                    question[:50] if question else "New Conversation"
                                ),
                            },
                        )
                    else:
                        conversation_id = self.conversation_service.save_conversation(
                            conversation_id,
                            question,
                            response_full,
                            thought,
                            source_log_docs,
                            tool_calls,
                            llm,
                            model_id or self.default_model_id,
                            decoded_token,
                            index=index,
                            api_key=user_api_key,
                            agent_id=agent_id,
                            is_shared_usage=is_shared_usage,
                            shared_token=shared_token,
                            attachment_ids=attachment_ids,
                            metadata=query_metadata if query_metadata else None,
                        )
                    # Persist compression metadata/summary if it exists and wasn't saved mid-execution
                    compression_meta = getattr(agent, "compression_metadata", None)
                    compression_saved = getattr(agent, "compression_saved", False)
@@ -449,6 +588,21 @@
                    )
                else:
                    conversation_id = None
                # Resume finished cleanly; drop the continuation row.
                # Crash-paths leave it ``resuming`` for the janitor to revert.
                if _continuation and conversation_id:
                    try:
                        cont_service = ContinuationService()
                        cont_service.delete_state(
                            str(conversation_id),
                            decoded_token.get("sub", "local"),
                        )
                    except Exception as e:
                        logger.error(
                            f"Failed to delete continuation state on resume "
                            f"completion: {e}",
                            exc_info=True,
                        )
                id_data = {"type": "id", "id": str(conversation_id)}
                data = json.dumps(id_data)
                yield f"data: {data}\n\n"
@@ -503,10 +657,8 @@
                if isNoneDoc:
                    for doc in source_log_docs:
                        doc["source"] = "None"
                # Mirror the normal-path provider resolution so the
                # partial-save title LLM uses the model-owner's BYOM
                # registration (shared-agent dispatch) rather than
                # the deployment default with the instance api key.
                # Resolve under model-owner scope so shared-agent
                # title-gen uses owner BYOM, not deployment default.
                provider = (
                    get_provider_from_model_id(
                        model_id,
@@ -532,24 +684,46 @@
                    agent_id=agent_id,
                    model_user_id=model_user_id,
                )
                self.conversation_service.save_conversation(
                    conversation_id,
                    question,
                    response_full,
                    thought,
                    source_log_docs,
                    tool_calls,
                    llm,
                    model_id or self.default_model_id,
                    decoded_token,
                    index=index,
                    api_key=user_api_key,
                    agent_id=agent_id,
                    is_shared_usage=is_shared_usage,
                    shared_token=shared_token,
                    attachment_ids=attachment_ids,
                    metadata=query_metadata if query_metadata else None,
                )
                llm._token_usage_source = "title"
                if reserved_message_id is not None:
                    self.conversation_service.finalize_message(
                        reserved_message_id,
                        response_full,
                        thought=thought,
                        sources=source_log_docs,
                        tool_calls=tool_calls,
                        model_id=model_id or self.default_model_id,
                        metadata=query_metadata if query_metadata else None,
                        status="complete",
                        title_inputs={
                            "llm": llm,
                            "question": question,
                            "response": response_full,
                            "model_id": model_id or self.default_model_id,
                            "fallback_name": (
                                question[:50] if question else "New Conversation"
                            ),
                        },
                    )
                else:
                    self.conversation_service.save_conversation(
                        conversation_id,
                        question,
                        response_full,
                        thought,
                        source_log_docs,
                        tool_calls,
                        llm,
                        model_id or self.default_model_id,
                        decoded_token,
                        index=index,
                        api_key=user_api_key,
                        agent_id=agent_id,
                        is_shared_usage=is_shared_usage,
                        shared_token=shared_token,
                        attachment_ids=attachment_ids,
                        metadata=query_metadata if query_metadata else None,
                    )
                compression_meta = getattr(agent, "compression_metadata", None)
                compression_saved = getattr(agent, "compression_saved", False)
                if conversation_id and compression_meta and not compression_saved:
@@ -576,6 +750,24 @@
            raise
        except Exception as e:
            logger.error(f"Error in stream: {str(e)}", exc_info=True)
            if reserved_message_id is not None:
                try:
                    self.conversation_service.finalize_message(
                        reserved_message_id,
                        response_full or TERMINATED_RESPONSE_PLACEHOLDER,
                        thought=thought,
                        sources=source_log_docs,
                        tool_calls=tool_calls,
                        model_id=model_id or self.default_model_id,
                        metadata=query_metadata if query_metadata else None,
                        status="failed",
                        error=e,
                    )
                except Exception as fin_err:
                    logger.error(
                        f"Failed to finalize errored message: {fin_err}",
                        exc_info=True,
                    )
            data = json.dumps(
                {
                    "type": "error",

@@ -115,6 +115,8 @@ class StreamResource(Resource, BaseAnswerResource):
                    "tools_dict": tools_dict,
                    "pending_tool_calls": pending_tool_calls,
                    "tool_actions": tool_actions,
                    "reserved_message_id": processor.reserved_message_id,
                    "request_id": processor.request_id,
                },
            ),
            mimetype="text/event-stream",

@@ -160,6 +160,9 @@ class CompressionOrchestrator:
            agent_id=conversation.get("agent_id"),
            model_user_id=registry_user_id,
        )
        # Side-channel LLM tag — distinguishes compression rows
        # from primary stream rows for cost-attribution dashboards.
        compression_llm._token_usage_source = "compression"

        # Create compression service with DB update capability
        compression_service = CompressionService(

@@ -7,13 +7,13 @@ resume later by sending tool_actions.

import logging
from typing import Any, Dict, List, Optional
from uuid import UUID

from application.storage.db.base_repository import looks_like_uuid
from application.storage.db.repositories.conversations import ConversationsRepository
from application.storage.db.repositories.pending_tool_state import (
    PendingToolStateRepository,
)
from application.storage.db.serialization import coerce_pg_native as _make_serializable
from application.storage.db.session import db_readonly, db_session

logger = logging.getLogger(__name__)
@@ -21,23 +21,9 @@ logger = logging.getLogger(__name__)
# TTL for pending states — auto-cleaned after this period
PENDING_STATE_TTL_SECONDS = 30 * 60  # 30 minutes


def _make_serializable(obj: Any) -> Any:
    """Recursively coerce non-JSON values into JSON-safe forms.

    Handles ``uuid.UUID`` (from PG columns), ``bytes``, and recurses into
    dicts/lists. Post-Mongo-cutover the ObjectId branch is gone — none of
    our writers produce them anymore.
    """
    if isinstance(obj, UUID):
        return str(obj)
    if isinstance(obj, dict):
        return {str(k): _make_serializable(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [_make_serializable(v) for v in obj]
    if isinstance(obj, bytes):
        return obj.decode("utf-8", errors="replace")
    return obj
# Re-export so the existing tests at tests/api/answer/services/test_continuation_service_pg.py
# can keep importing ``_make_serializable`` from here.
__all__ = ["_make_serializable", "ContinuationService", "PENDING_STATE_TTL_SECONDS"]


class ContinuationService:
@@ -155,3 +141,23 @@ class ContinuationService:
            f"Deleted continuation state for conversation {conversation_id}"
        )
        return deleted

    def mark_resuming(self, conversation_id: str, user: str) -> bool:
        """Flip the pending row to ``resuming`` so a crashed resume can be retried."""
        with db_session() as conn:
            conv = ConversationsRepository(conn).get_by_legacy_id(conversation_id)
            if conv is not None:
                pg_conv_id = conv["id"]
            elif looks_like_uuid(conversation_id):
                pg_conv_id = conversation_id
            else:
                return False
            flipped = PendingToolStateRepository(conn).mark_resuming(
                pg_conv_id, user
            )
        if flipped:
            logger.info(
                f"Marked continuation state as resuming for conversation "
                f"{conversation_id}"
            )
        return flipped

@@ -6,6 +6,7 @@ than held for the duration of a stream.
"""

import logging
import uuid
from datetime import datetime, timezone
from typing import Any, Dict, List, Optional

@@ -21,6 +22,12 @@ from application.storage.db.session import db_readonly, db_session
logger = logging.getLogger(__name__)


# Shown to the user if the worker dies mid-stream and the response is never finalised.
TERMINATED_RESPONSE_PLACEHOLDER = (
    "Response was terminated prior to completion, try regenerating."
)


class ConversationService:
    def get_conversation(
        self, conversation_id: str, user_id: str
@@ -179,6 +186,236 @@ class ConversationService:
        repo.append_message(conv_pg_id, append_payload)
        return conv_pg_id

    def save_user_question(
        self,
        conversation_id: Optional[str],
        question: str,
        decoded_token: Dict[str, Any],
        *,
        attachment_ids: Optional[List[str]] = None,
        api_key: Optional[str] = None,
        agent_id: Optional[str] = None,
        is_shared_usage: bool = False,
        shared_token: Optional[str] = None,
        model_id: Optional[str] = None,
        request_id: Optional[str] = None,
        status: str = "pending",
        index: Optional[int] = None,
    ) -> Dict[str, str]:
        """Reserve the placeholder message row before the LLM call.

        ``index`` triggers regenerate semantics: messages at
        ``position >= index`` are truncated so the new placeholder
        lands at ``position = index`` rather than appending.

        Returns ``{"conversation_id", "message_id", "request_id"}``.
        """
        if decoded_token is None:
            raise ValueError("Invalid or missing authentication token")
        user_id = decoded_token.get("sub")
        if not user_id:
            raise ValueError("User ID not found in token")

        request_id = request_id or str(uuid.uuid4())

        resolved_api_key: Optional[str] = None
        resolved_agent_id: Optional[str] = None
        if api_key and not conversation_id:
            with db_readonly() as conn:
                agent = AgentsRepository(conn).find_by_key(api_key)
            if agent:
                resolved_api_key = agent.get("key")
                if agent_id:
                    resolved_agent_id = agent_id

        with db_session() as conn:
            repo = ConversationsRepository(conn)
            if conversation_id:
                conv = repo.get_any(conversation_id, user_id)
                if conv is None:
                    raise ValueError("Conversation not found or unauthorized")
                conv_pg_id = str(conv["id"])
                # Regenerate / edit-prior-question: drop the message at
                # ``index`` and everything after it so the new
                # ``reserve_message`` lands at ``position=index`` rather
                # than appending at the end of the conversation.
                if isinstance(index, int) and index >= 0:
                    repo.truncate_after(conv_pg_id, keep_up_to=index - 1)
            else:
                fallback_name = (question[:50] if question else "New Conversation")
                conv = repo.create(
                    user_id,
                    fallback_name,
                    agent_id=resolved_agent_id,
                    api_key=resolved_api_key,
                    is_shared_usage=bool(resolved_agent_id and is_shared_usage),
                    shared_token=(
                        shared_token
                        if (resolved_agent_id and is_shared_usage)
                        else None
                    ),
                )
                conv_pg_id = str(conv["id"])

            row = repo.reserve_message(
                conv_pg_id,
                prompt=question,
                placeholder_response=TERMINATED_RESPONSE_PLACEHOLDER,
                request_id=request_id,
                status=status,
                attachments=attachment_ids,
                model_id=model_id,
            )
            message_id = str(row["id"])

        return {
            "conversation_id": conv_pg_id,
            "message_id": message_id,
            "request_id": request_id,
        }

    def update_message_status(self, message_id: str, status: str) -> bool:
        """Cheap status-only transition (e.g. ``pending → streaming``)."""
        if not message_id:
            return False
        with db_session() as conn:
            return ConversationsRepository(conn).update_message_status(
                message_id, status,
            )

    def heartbeat_message(self, message_id: str) -> bool:
        """Bump ``message_metadata.last_heartbeat_at`` so the reconciler's
        staleness sweep counts the row as alive. No-ops on terminal rows.
        """
        if not message_id:
            return False
        with db_session() as conn:
            return ConversationsRepository(conn).heartbeat_message(message_id)

    def finalize_message(
        self,
        message_id: str,
        response: str,
        *,
        thought: str = "",
        sources: Optional[List[Dict[str, Any]]] = None,
        tool_calls: Optional[List[Dict[str, Any]]] = None,
        model_id: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
        status: str = "complete",
        error: Optional[BaseException] = None,
        title_inputs: Optional[Dict[str, Any]] = None,
    ) -> bool:
        """Commit the response and tool_call confirms in one transaction."""
        if not message_id:
            return False
        sources = sources or []
        for source in sources:
            if "text" in source and isinstance(source["text"], str):
                source["text"] = source["text"][:1000]

        merged_metadata: Dict[str, Any] = dict(metadata or {})
        if status == "failed" and error is not None:
            merged_metadata.setdefault(
                "error", f"{type(error).__name__}: {str(error)}"
            )

        update_fields: Dict[str, Any] = {
            "response": response,
            "status": status,
            "thought": thought,
            "sources": sources,
            "tool_calls": tool_calls or [],
            "metadata": merged_metadata,
        }
        if model_id is not None:
            update_fields["model_id"] = model_id

        # Atomic message update + tool_call_attempts confirm; the
        # ``only_if_non_terminal`` guard prevents a late stream from
        # retracting a row the reconciler already escalated.
        with db_session() as conn:
            repo = ConversationsRepository(conn)
            ok = repo.update_message_by_id(
                message_id, update_fields,
                only_if_non_terminal=True,
            )
            if not ok:
                logger.warning(
                    f"finalize_message: no row updated for message_id={message_id} "
                    f"(possibly already terminal — reconciler may have escalated)"
                )
                return False
            repo.confirm_executed_tool_calls(message_id)

        # Outside the txn — title-gen is a multi-second LLM round trip.
        if title_inputs and status == "complete":
            try:
                with db_session() as conn:
                    self._maybe_generate_title(conn, message_id, title_inputs)
            except Exception as e:
                logger.error(
                    f"finalize_message title generation failed: {e}",
                    exc_info=True,
                )
        return True

    def _maybe_generate_title(
        self,
        conn,
        message_id: str,
        title_inputs: Dict[str, Any],
    ) -> None:
        """Generate an LLM-summarised conversation name if one isn't set yet."""
        llm = title_inputs.get("llm")
        question = title_inputs.get("question") or ""
        response = title_inputs.get("response") or ""
        fallback_name = title_inputs.get("fallback_name") or question[:50]
        if llm is None:
            return

        row = conn.execute(
            sql_text(
                "SELECT c.id, c.name FROM conversation_messages m "
                "JOIN conversations c ON c.id = m.conversation_id "
                "WHERE m.id = CAST(:mid AS uuid)"
            ),
            {"mid": message_id},
        ).fetchone()
        if row is None:
            return
        conv_id, current_name = str(row[0]), row[1]
        if current_name and current_name != fallback_name:
            return

        messages_summary = [
            {
                "role": "system",
                "content": "You are a helpful assistant that creates concise conversation titles. "
                "Summarize conversations in 3 words or less using the same language as the user.",
            },
            {
                "role": "user",
                "content": "Summarise following conversation in no more than 3 words, "
                "respond ONLY with the summary, use the same language as the "
                "user query \n\nUser: " + question + "\n\n" + "AI: " + response,
            },
        ]
        completion = llm.gen(
            model=getattr(llm, "model_id", None) or title_inputs.get("model_id"),
            messages=messages_summary,
            max_tokens=500,
        )
        if not completion or not completion.strip():
            completion = fallback_name or "New Conversation"
        conn.execute(
            sql_text(
                "UPDATE conversations SET name = :name, updated_at = now() "
                "WHERE id = CAST(:id AS uuid)"
            ),
            {"id": conv_id, "name": completion.strip()},
        )

    def update_compression_metadata(
        self, conversation_id: str, compression_metadata: Dict[str, Any]
    ) -> None:

@@ -123,6 +123,10 @@ class StreamProcessor:
        self.model_id: Optional[str] = None
        # BYOM-resolution scope, set by _validate_and_set_model.
        self.model_user_id: Optional[str] = None
        # WAL placeholder id pulled from continuation state on resume.
        self.reserved_message_id: Optional[str] = None
        # Carried through resumes so multi-pause runs keep one request_id.
        self.request_id: Optional[str] = None
        self.conversation_service = ConversationService()
        self.compression_orchestrator = CompressionOrchestrator(
            self.conversation_service
@@ -928,6 +932,20 @@ class StreamProcessor:
        if not state:
            raise ValueError("No pending tool state found for this conversation")

        # Claim the resume up-front. ``mark_resuming`` only flips ``pending``
        # → ``resuming``; if it returns False, another resume already
        # claimed this row (status='resuming') — bail before any further
        # LLM/tool work to avoid double-execution. The cleanup janitor
        # reverts a stale ``resuming`` claim back to ``pending`` after the
        # 10-minute grace window so the user can retry.
        if not cont_service.mark_resuming(
            conversation_id, self.initial_user_id,
        ):
            raise ValueError(
                "Resume already in progress for this conversation; "
                "retry after the grace window if it stalls."
            )

        messages = state["messages"]
        pending_tool_calls = state["pending_tool_calls"]
        tools_dict = state["tools_dict"]
@@ -1022,9 +1040,10 @@
        self.agent_id = agent_id
        self.agent_config["user_api_key"] = user_api_key
        self.conversation_id = conversation_id

        # Delete state so it can't be replayed
        cont_service.delete_state(conversation_id, self.initial_user_id)
        # Reused on resume so the same WAL row gets finalised and
        # request_id stays consistent across token_usage rows.
        self.reserved_message_id = agent_config.get("reserved_message_id")
        self.request_id = agent_config.get("request_id")

        return agent, messages, tools_dict, pending_tool_calls, tool_actions
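
The cleanup janitor the claim comment leans on is also not in this diff; reverting a stale claim needs nothing more than the columns migration 0004 added. A sketch under those assumptions — the function name, placement, and exact grace arithmetic are illustrative:

from sqlalchemy import text as sql_text

from application.storage.db.session import db_session

RESUME_GRACE_MINUTES = 10  # matches the grace window named in the comment


def revert_stale_resumes() -> int:
    # Hypothetical janitor pass: crashed resumes left in 'resuming' go back
    # to 'pending' so the user can retry. The partial index
    # pending_tool_state_resuming_ts_idx keeps this scan cheap.
    with db_session() as conn:
        result = conn.execute(
            sql_text(
                "UPDATE pending_tool_state "
                "SET status = 'pending', resumed_at = NULL "
                "WHERE status = 'resuming' "
                "AND resumed_at < now() - make_interval(mins => :grace)"
            ),
            {"grace": RESUME_GRACE_MINUTES},
        )
        return result.rowcount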
|
||||
|
||||
|
||||
@@ -46,7 +46,9 @@ AGENT_TYPE_SCHEMAS = {
|
||||
"prompt_id",
|
||||
],
|
||||
"required_draft": ["name"],
|
||||
"validate_published": ["name", "description", "prompt_id"],
|
||||
# ``prompt_id`` intentionally omitted — the "default" sentinel
|
||||
# is acceptable and maps to NULL downstream.
|
||||
"validate_published": ["name", "description"],
|
||||
"validate_draft": [],
|
||||
"require_source": True,
|
||||
"fields": [
|
||||
@@ -1009,12 +1011,16 @@ class UpdateAgent(Resource):
|
||||
400,
|
||||
)
|
||||
else:
|
||||
# ``prompt_id`` is intentionally omitted: the
|
||||
# frontend's "default" choice maps to NULL here
|
||||
# (see the prompt_id branch above), and NULL
|
||||
# means "use the built-in default prompt" which
|
||||
# is a valid published-agent state.
|
||||
missing_published_fields = []
|
||||
for req_field, field_label in (
|
||||
("name", "Agent name"),
|
||||
("description", "Agent description"),
|
||||
("chunks", "Chunks count"),
|
||||
("prompt_id", "Prompt"),
|
||||
("agent_type", "Agent type"),
|
||||
):
|
||||
final_value = update_fields.get(
|
||||
@@ -1028,8 +1034,23 @@ class UpdateAgent(Resource):
|
||||
extra_final = update_fields.get(
|
||||
"extra_source_ids", existing_agent.get("extra_source_ids") or [],
|
||||
)
|
||||
if not source_final and not extra_final:
|
||||
missing_published_fields.append("Source")
|
||||
# ``retriever`` carries the runtime identity for
|
||||
# agents that publish against the synthetic
|
||||
# "Default" source (frontend's auto-selected
|
||||
# ``{name: "Default", retriever: "classic"}``
|
||||
# entry has no ``id``, so ``source_id`` ends up
|
||||
# NULL even though the user picked something).
|
||||
# Without this fallback the most common new-agent
|
||||
# publish flow gets a 400.
|
||||
retriever_final = update_fields.get(
|
||||
"retriever", existing_agent.get("retriever"),
|
||||
)
|
||||
if (
|
||||
not source_final
|
||||
and not extra_final
|
||||
and not retriever_final
|
||||
):
|
||||
missing_published_fields.append("Source or retriever")
|
||||
if missing_published_fields:
|
||||
return make_response(
|
||||
jsonify(
|
||||
|
||||
@@ -1,15 +1,19 @@
"""Agent management webhook handlers."""

import secrets
import uuid

from flask import current_app, jsonify, make_response, request
from flask_restx import Namespace, Resource
from sqlalchemy import text as sql_text

from application.api import api
from application.api.user.base import require_agent
from application.api.user.tasks import process_agent_webhook
from application.core.settings import settings
from application.storage.db.base_repository import looks_like_uuid
from application.storage.db.repositories.agents import AgentsRepository
from application.storage.db.repositories.idempotency import IdempotencyRepository
from application.storage.db.session import db_readonly, db_session


@@ -18,6 +22,37 @@ agents_webhooks_ns = Namespace(
)


_IDEMPOTENCY_KEY_MAX_LEN = 256


def _read_idempotency_key():
    """Return (key, error_response). Empty header → (None, None); oversized → (None, 400)."""
    key = request.headers.get("Idempotency-Key")
    if not key:
        return None, None
    if len(key) > _IDEMPOTENCY_KEY_MAX_LEN:
        return None, make_response(
            jsonify(
                {
                    "success": False,
                    "message": (
                        f"Idempotency-Key exceeds maximum length of "
                        f"{_IDEMPOTENCY_KEY_MAX_LEN} characters"
                    ),
                }
            ),
            400,
        )
    return key, None


def _scoped_idempotency_key(idempotency_key, scope):
    """``{scope}:{key}`` so different agents can't collide on the same key."""
    if not idempotency_key or not scope:
        return None
    return f"{scope}:{idempotency_key}"


@agents_webhooks_ns.route("/agent_webhook")
class AgentWebhook(Resource):
    @api.doc(
@@ -68,7 +103,7 @@ class AgentWebhook(Resource):
class AgentWebhookListener(Resource):
    method_decorators = [require_agent]

    def _enqueue_webhook_task(self, agent_id_str, payload, source_method):
    def _enqueue_webhook_task(self, agent_id_str, payload, source_method, agent=None):
        if not payload:
            current_app.logger.warning(
                f"Webhook ({source_method}) received for agent {agent_id_str} with empty payload."
@@ -77,26 +112,94 @@ class AgentWebhookListener(Resource):
            f"Incoming {source_method} webhook for agent {agent_id_str}. Enqueuing task with payload: {payload}"
        )

        try:
            task = process_agent_webhook.delay(
                agent_id=agent_id_str,
                payload=payload,
        idempotency_key, key_error = _read_idempotency_key()
        if key_error is not None:
            return key_error
        # Resolve to PG UUID first so dedup writes don't crash on legacy ids.
        agent_uuid = None
        if agent is not None:
            candidate = str(agent.get("id") or "")
            if looks_like_uuid(candidate):
                agent_uuid = candidate
        if idempotency_key and agent_uuid is None:
            current_app.logger.warning(
                "Skipping webhook idempotency dedup: agent %s has non-UUID id",
                agent_id_str,
            )
            idempotency_key = None
        # Agent-scoped (webhooks have no user_id).
        scoped_key = _scoped_idempotency_key(idempotency_key, agent_uuid)
        # Claim before enqueue; the loser returns the winner's task_id.
        predetermined_task_id = None
        if scoped_key:
            predetermined_task_id = str(uuid.uuid4())
            with db_session() as conn:
                claimed = IdempotencyRepository(conn).record_webhook(
                    key=scoped_key,
                    agent_id=agent_uuid,
                    task_id=predetermined_task_id,
                    response_json={
                        "success": True, "task_id": predetermined_task_id,
                    },
                )
            if claimed is None:
                with db_readonly() as conn:
                    cached = IdempotencyRepository(conn).get_webhook(scoped_key)
                if cached is not None:
                    return make_response(jsonify(cached["response_json"]), 200)
                return make_response(
                    jsonify({"success": True, "task_id": "deduplicated"}), 200
                )

        try:
            apply_kwargs = dict(
                kwargs={
                    "agent_id": agent_id_str,
                    "payload": payload,
                    # Scoped so the worker dedup row matches the HTTP claim.
                    "idempotency_key": scoped_key or idempotency_key,
                },
            )
            if predetermined_task_id is not None:
                apply_kwargs["task_id"] = predetermined_task_id
            task = process_agent_webhook.apply_async(**apply_kwargs)
            current_app.logger.info(
                f"Task {task.id} enqueued for agent {agent_id_str} ({source_method})."
            )
            return make_response(jsonify({"success": True, "task_id": task.id}), 200)
            response_payload = {"success": True, "task_id": task.id}
            return make_response(jsonify(response_payload), 200)
        except Exception as err:
            current_app.logger.error(
                f"Error enqueuing webhook task ({source_method}) for agent {agent_id_str}: {err}",
                exc_info=True,
            )
            if scoped_key:
                # Roll back the claim so a retry can succeed.
                try:
                    with db_session() as conn:
                        conn.execute(
                            sql_text(
                                "DELETE FROM webhook_dedup "
                                "WHERE idempotency_key = :k"
                            ),
                            {"k": scoped_key},
                        )
                except Exception:
                    current_app.logger.exception(
                        "Failed to release webhook_dedup claim for key=%s",
                        scoped_key,
                    )
            return make_response(
                jsonify({"success": False, "message": "Error processing webhook"}), 500
            )

    @api.doc(
        description="Webhook listener for agent events (POST). Expects JSON payload, which is used to trigger processing.",
        description=(
            "Webhook listener for agent events (POST). Expects JSON payload, which "
            "is used to trigger processing. Honors an optional ``Idempotency-Key`` "
            "header: a repeat request with the same key within 24h returns the "
            "original cached response and does not re-enqueue the task."
        ),
    )
    def post(self, webhook_token, agent, agent_id_str):
        payload = request.get_json()
@@ -110,11 +213,20 @@ class AgentWebhookListener(Resource):
            ),
            400,
        )
        return self._enqueue_webhook_task(agent_id_str, payload, source_method="POST")
        return self._enqueue_webhook_task(
            agent_id_str, payload, source_method="POST", agent=agent,
        )

    @api.doc(
        description="Webhook listener for agent events (GET). Uses URL query parameters as payload to trigger processing.",
        description=(
            "Webhook listener for agent events (GET). Uses URL query parameters as "
            "payload to trigger processing. Honors an optional ``Idempotency-Key`` "
            "header: a repeat request with the same key within 24h returns the "
            "original cached response and does not re-enqueue the task."
        ),
    )
    def get(self, webhook_token, agent, agent_id_str):
        payload = request.args.to_dict(flat=True)
        return self._enqueue_webhook_task(agent_id_str, payload, source_method="GET")
        return self._enqueue_webhook_task(
            agent_id_str, payload, source_method="GET", agent=agent,
        )
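A minimal client-side sketch of the retry contract described in the docstrings above. The host, route, and webhook token are hypothetical placeholders; only the Idempotency-Key semantics come from the handler itself.

import uuid

import requests

# Hypothetical endpoint; the real path comes from the namespace route
# registration and the agent's webhook token.
URL = "http://localhost:7091/api/webhooks/agents/<webhook_token>"
HEADERS = {"Idempotency-Key": str(uuid.uuid4())}

first = requests.post(URL, json={"event": "doc_updated"}, headers=HEADERS)
retry = requests.post(URL, json={"event": "doc_updated"}, headers=HEADERS)

# Within the 24h dedup window the retry returns the cached response,
# so both carry the same task_id and only one Celery task is enqueued.
assert first.json()["task_id"] == retry.json()["task_id"]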
@@ -4,8 +4,10 @@ import datetime

from flask import current_app, jsonify, make_response, request
from flask_restx import fields, Namespace, Resource
from sqlalchemy import text as sql_text

from application.api import api
from application.storage.db.base_repository import looks_like_uuid, row_to_dict
from application.storage.db.repositories.attachments import AttachmentsRepository
from application.storage.db.repositories.conversations import ConversationsRepository
from application.storage.db.session import db_readonly, db_session
@@ -104,6 +106,85 @@ class GetConversations(Resource):
        return make_response(jsonify(list_conversations), 200)


@conversations_ns.route("/search_conversations")
class SearchConversations(Resource):
    @staticmethod
    def _build_match_snippet(text_value: str, query: str, radius: int = 60) -> str:
        if not text_value:
            return ""
        idx = text_value.lower().find(query.lower())
        if idx == -1:
            snippet = text_value[: radius * 2]
            return snippet + ("…" if len(text_value) > len(snippet) else "")
        start = max(0, idx - radius)
        end = min(len(text_value), idx + len(query) + radius)
        snippet = text_value[start:end]
        if start > 0:
            snippet = "…" + snippet
        if end < len(text_value):
            snippet = snippet + "…"
        return snippet

    @api.doc(
        description=(
            "Search the authenticated user's conversations by name or "
            "message content (case-insensitive substring match). Mirrors "
            "the visibility filter and response shape of /get_conversations, "
            "and additionally returns ``match_field`` (``name``, ``prompt`` "
            "or ``response``) and ``match_snippet`` (a short excerpt of the "
            "matched text centered on the query) for each result."
        ),
        params={
            "q": "Search term (required)",
            "limit": "Maximum number of results to return (default 30, max 100)",
        },
    )
    def get(self):
        decoded_token = request.decoded_token
        if not decoded_token:
            return make_response(jsonify({"success": False}), 401)
        query = (request.args.get("q") or "").strip()
        if not query:
            return make_response(
                jsonify({"success": False, "message": "q is required"}), 400
            )
        try:
            limit = int(request.args.get("limit", 30))
        except (TypeError, ValueError):
            limit = 30
        limit = max(1, min(limit, 100))
        user_id = decoded_token.get("sub")
        try:
            with db_readonly() as conn:
                conversations = ConversationsRepository(conn).search_for_user(
                    user_id, query, limit=limit
                )
            list_conversations = [
                {
                    "id": str(conversation["id"]),
                    "name": conversation["name"],
                    "agent_id": (
                        str(conversation["agent_id"])
                        if conversation.get("agent_id")
                        else None
                    ),
                    "is_shared_usage": conversation.get("is_shared_usage", False),
                    "shared_token": conversation.get("shared_token", None),
                    "match_field": conversation.get("match_field"),
                    "match_snippet": self._build_match_snippet(
                        conversation.get("match_text") or "", query
                    ),
                }
                for conversation in conversations
            ]
        except Exception as err:
            current_app.logger.error(
                f"Error searching conversations: {err}", exc_info=True
            )
            return make_response(jsonify({"success": False}), 400)
        return make_response(jsonify(list_conversations), 200)


@conversations_ns.route("/get_single_conversation")
class GetSingleConversation(Resource):
    @api.doc(
@@ -133,6 +214,7 @@ class GetSingleConversation(Resource):
                attachments_repo = AttachmentsRepository(conn)
                queries = []
                for msg in messages:
                    metadata = msg.get("metadata") or {}
                    query = {
                        "prompt": msg.get("prompt"),
                        "response": msg.get("response"),
@@ -141,9 +223,15 @@ class GetSingleConversation(Resource):
                        "tool_calls": msg.get("tool_calls") or [],
                        "timestamp": msg.get("timestamp"),
                        "model_id": msg.get("model_id"),
                        # Lets the client distinguish placeholder rows from
                        # finalised answers and tail-poll in-flight ones.
                        "message_id": str(msg["id"]) if msg.get("id") else None,
                        "status": msg.get("status"),
                        "request_id": msg.get("request_id"),
                        "last_heartbeat_at": metadata.get("last_heartbeat_at"),
                    }
                    if msg.get("metadata"):
                        query["metadata"] = msg["metadata"]
                    if metadata:
                        query["metadata"] = metadata
                    # Feedback on conversation_messages is a JSONB blob with
                    # shape {"text": <str>, "timestamp": <iso>}. The legacy
                    # frontend consumed a flat scalar feedback string, so
@@ -301,3 +389,61 @@ class SubmitFeedback(Resource):
            current_app.logger.error(f"Error submitting feedback: {err}", exc_info=True)
            return make_response(jsonify({"success": False}), 400)
        return make_response(jsonify({"success": True}), 200)


@conversations_ns.route("/messages/<string:message_id>/tail")
class GetMessageTail(Resource):
    @api.doc(
        description=(
            "Current state of one conversation_messages row, scoped to the "
            "authenticated user. Used to reconnect to an in-flight stream "
            "after a refresh."
        ),
        params={"message_id": "Message UUID"},
    )
    def get(self, message_id):
        decoded_token = request.decoded_token
        if not decoded_token:
            return make_response(jsonify({"success": False}), 401)
        if not looks_like_uuid(message_id):
            return make_response(
                jsonify({"success": False, "message": "Invalid message id"}), 400
            )
        user_id = decoded_token.get("sub")
        try:
            with db_readonly() as conn:
                # Owner-or-shared, matching ``ConversationsRepository.get``.
                row = conn.execute(
                    sql_text(
                        "SELECT m.* FROM conversation_messages m "
                        "JOIN conversations c ON c.id = m.conversation_id "
                        "WHERE m.id = CAST(:mid AS uuid) "
                        "AND (c.user_id = :uid OR :uid = ANY(c.shared_with))"
                    ),
                    {"mid": message_id, "uid": user_id},
                ).fetchone()
            if row is None:
                return make_response(jsonify({"status": "not found"}), 404)
            msg = row_to_dict(row)
        except Exception as err:
            current_app.logger.error(
                f"Error tailing message {message_id}: {err}", exc_info=True
            )
            return make_response(jsonify({"success": False}), 400)
        metadata = msg.get("message_metadata") or {}
        return make_response(
            jsonify(
                {
                    "message_id": str(msg["id"]),
                    "status": msg.get("status"),
                    "response": msg.get("response"),
                    "thought": msg.get("thought"),
                    "sources": msg.get("sources") or [],
                    "tool_calls": msg.get("tool_calls") or [],
                    "request_id": msg.get("request_id"),
                    "last_heartbeat_at": metadata.get("last_heartbeat_at"),
                    "error": metadata.get("error"),
                }
            ),
            200,
        )
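For reference, the snippet helper above is a pure function, so its behaviour can be shown directly (illustrative strings; outputs follow from the code as written):

text = "The quick brown fox jumps over the lazy dog"

SearchConversations._build_match_snippet(text, "fox", radius=5)
# -> '…rown fox jump…'  (match centered, ellipses mark truncation)

SearchConversations._build_match_snippet(text, "zebra", radius=5)
# -> 'The quick …'  (no match: the first radius * 2 characters, plus an
#                    ellipsis because the text continues)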
application/api/user/idempotency.py (new file, 237 lines)
@@ -0,0 +1,237 @@
"""Per-Celery-task idempotency wrapper backed by ``task_dedup``."""

from __future__ import annotations

import functools
import logging
import threading
import uuid
from typing import Any, Callable, Optional

from application.storage.db.repositories.idempotency import IdempotencyRepository
from application.storage.db.session import db_readonly, db_session


logger = logging.getLogger(__name__)


# Poison-loop cap; transient-failure headroom without infinite retry.
MAX_TASK_ATTEMPTS = 5

# 30s heartbeat / 60s TTL → ~2 missed ticks of slack before reclaim.
LEASE_TTL_SECONDS = 60
LEASE_HEARTBEAT_INTERVAL = 30

# 10 × 60s ≈ 5 min of deferral before giving up on a held lease.
LEASE_RETRY_MAX = 10


def with_idempotency(task_name: str) -> Callable[[Callable[..., Any]], Callable[..., Any]]:
    """Short-circuit on completed key; gate concurrent runs via a lease.

    Entry short-circuits:
    - completed row → return cached result
    - live lease held → retry(countdown=LEASE_TTL_SECONDS)
    - attempt_count > MAX_TASK_ATTEMPTS → poison-loop alert
    Success writes ``completed``; exceptions leave ``pending`` for
    autoretry until the poison-loop guard trips.
    """

    def decorator(fn: Callable[..., Any]) -> Callable[..., Any]:
        @functools.wraps(fn)
        def wrapper(self, *args: Any, idempotency_key: Any = None, **kwargs: Any) -> Any:
            key = idempotency_key if isinstance(idempotency_key, str) and idempotency_key else None
            if key is None:
                return fn(self, *args, idempotency_key=idempotency_key, **kwargs)

            cached = _lookup_completed(key)
            if cached is not None:
                logger.info(
                    "idempotency hit for task=%s key=%s — returning cached result",
                    task_name, key,
                )
                return cached

            owner_id = str(uuid.uuid4())
            attempt = _try_claim_lease(
                key, task_name, _safe_task_id(self), owner_id,
            )
            if attempt is None:
                # Live lease held by another worker. Re-queue and bail
                # quickly — by the time the retry fires (LEASE_TTL
                # seconds), Worker 1 has either finalised (we'll hit
                # ``_lookup_completed`` and return cached) or its lease
                # has expired and we can claim.
                logger.info(
                    "idempotency: live lease held; deferring task=%s key=%s",
                    task_name, key,
                )
                raise self.retry(
                    countdown=LEASE_TTL_SECONDS,
                    max_retries=LEASE_RETRY_MAX,
                )

            if attempt > MAX_TASK_ATTEMPTS:
                logger.error(
                    "idempotency poison-loop guard: task=%s key=%s attempts=%s",
                    task_name, key, attempt,
                    extra={
                        "alert": "idempotency_poison_loop",
                        "task_name": task_name,
                        "idempotency_key": key,
                        "attempts": attempt,
                    },
                )
                poisoned = {
                    "success": False,
                    "error": "idempotency poison-loop guard tripped",
                    "attempts": attempt,
                }
                _finalize(key, poisoned, status="failed")
                return poisoned

            heartbeat_thread, heartbeat_stop = _start_lease_heartbeat(
                key, owner_id,
            )
            try:
                result = fn(self, *args, idempotency_key=idempotency_key, **kwargs)
                _finalize(key, result, status="completed")
                return result
            except Exception:
                # Drop the lease so the next retry doesn't wait LEASE_TTL.
                _release_lease(key, owner_id)
                raise
            finally:
                _stop_lease_heartbeat(heartbeat_thread, heartbeat_stop)

        return wrapper

    return decorator


def _lookup_completed(key: str) -> Any:
    """Return cached ``result_json`` if a completed row exists for ``key``, else None."""
    with db_readonly() as conn:
        row = IdempotencyRepository(conn).get_task(key)
    if row is None:
        return None
    if row.get("status") != "completed":
        return None
    return row.get("result_json")


def _try_claim_lease(
    key: str, task_name: str, task_id: str, owner_id: str,
) -> Optional[int]:
    """Atomic CAS; returns ``attempt_count`` or ``None`` when held.

    DB outage → treated as ``attempt=1`` so transient failures don't
    block all task execution; reconciler repairs the lease columns.
    """
    try:
        with db_session() as conn:
            return IdempotencyRepository(conn).try_claim_lease(
                key=key,
                task_name=task_name,
                task_id=task_id,
                owner_id=owner_id,
                ttl_seconds=LEASE_TTL_SECONDS,
            )
    except Exception:
        logger.exception(
            "idempotency lease-claim failed for key=%s task=%s", key, task_name,
        )
        return 1


def _finalize(key: str, result_json: Any, *, status: str) -> None:
    """Best-effort terminal write. Never let DB outage fail the task."""
    try:
        with db_session() as conn:
            IdempotencyRepository(conn).finalize_task(
                key=key, result_json=result_json, status=status,
            )
    except Exception:
        logger.exception(
            "idempotency finalize failed for key=%s status=%s", key, status,
        )


def _release_lease(key: str, owner_id: str) -> None:
    """Best-effort lease release on the wrapper's exception path."""
    try:
        with db_session() as conn:
            IdempotencyRepository(conn).release_lease(key, owner_id)
    except Exception:
        logger.exception("idempotency release-lease failed for key=%s", key)


def _start_lease_heartbeat(
    key: str, owner_id: str,
) -> tuple[threading.Thread, threading.Event]:
    """Spawn a daemon thread that bumps ``lease_expires_at`` every
    :data:`LEASE_HEARTBEAT_INTERVAL` seconds until ``stop_event`` fires.

    Mirrors ``application.worker._start_ingest_heartbeat`` so the two
    durability heartbeats share shape and cadence.
    """
    stop_event = threading.Event()
    thread = threading.Thread(
        target=_lease_heartbeat_loop,
        args=(key, owner_id, stop_event, LEASE_HEARTBEAT_INTERVAL),
        daemon=True,
        name=f"idempotency-lease-heartbeat:{key[:32]}",
    )
    thread.start()
    return thread, stop_event


def _stop_lease_heartbeat(
    thread: threading.Thread, stop_event: threading.Event,
) -> None:
    """Signal the heartbeat thread to exit and join with a short timeout."""
    stop_event.set()
    thread.join(timeout=10)


def _lease_heartbeat_loop(
    key: str,
    owner_id: str,
    stop_event: threading.Event,
    interval: int,
) -> None:
    """Refresh the lease until ``stop_event`` is set or ownership is lost.

    A failed refresh (rowcount 0) means another worker stole the lease
    after expiry — at that point the damage is already possible, so we
    log and keep ticking. Don't escalate to thread death; the main task
    body needs to keep running so its outcome is at least *recorded*.
    """
    while not stop_event.wait(interval):
        try:
            with db_session() as conn:
                still_owned = IdempotencyRepository(conn).refresh_lease(
                    key=key, owner_id=owner_id, ttl_seconds=LEASE_TTL_SECONDS,
                )
            if not still_owned:
                logger.warning(
                    "idempotency lease lost mid-task for key=%s "
                    "(another worker may have taken over)",
                    key,
                )
        except Exception:
            logger.exception(
                "idempotency lease-heartbeat tick failed for key=%s", key,
            )


def _safe_task_id(task_self: Any) -> str:
    """Best-effort extraction of ``self.request.id`` from a Celery task."""
    try:
        request = getattr(task_self, "request", None)
        task_id: Optional[str] = (
            getattr(request, "id", None) if request is not None else None
        )
    except Exception:
        task_id = None
    return task_id or "unknown"
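A minimal sketch of how the wrapper above composes with a Celery task. The task name and body are hypothetical; the decorator order (Celery outermost, idempotency inside) matches the real tasks module.

from application.api.user.idempotency import with_idempotency
from application.celery_init import celery


@celery.task(bind=True, acks_late=True, autoretry_for=(Exception,),
             retry_kwargs={"max_retries": 3, "countdown": 60})
@with_idempotency(task_name="toy_task")
def toy_task(self, payload, idempotency_key=None):
    # First run with a given key claims the lease, executes, and writes
    # a ``completed`` row; a redelivery with the same key returns the
    # cached result without re-running this body.
    return {"success": True, "echo": payload}


# Enqueue twice with the same key: the second run short-circuits.
toy_task.apply_async(kwargs={"payload": 1, "idempotency_key": "user-123:abc"})
toy_task.apply_async(kwargs={"payload": 1, "idempotency_key": "user-123:abc"})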
application/api/user/reconciliation.py (new file, 196 lines)
@@ -0,0 +1,196 @@
"""Reconciler tick: sweep stuck rows and escalate to terminal status + alert."""

from __future__ import annotations

import logging
import uuid
from typing import Any, Dict, Optional

from sqlalchemy import Connection

from application.api.user.idempotency import MAX_TASK_ATTEMPTS
from application.core.settings import settings
from application.storage.db.engine import get_engine
from application.storage.db.repositories.reconciliation import (
    ReconciliationRepository,
)
from application.storage.db.repositories.stack_logs import StackLogsRepository

logger = logging.getLogger(__name__)


MAX_MESSAGE_RECONCILE_ATTEMPTS = 3


def run_reconciliation() -> Dict[str, Any]:
    """Single tick of the reconciler. Five sweeps, FOR UPDATE SKIP LOCKED.

    Stuck ``executed`` tool calls always flip to ``failed`` — operators
    handle cleanup manually via the structured alert. The side effect is
    assumed to have committed; no automated rollback is attempted.

    Stuck ``task_dedup`` rows (lease expired AND attempts >= max)
    promote to ``failed`` so a same-key retry can re-claim instead of
    sitting in ``pending`` until 24 h TTL.
    """
    if not settings.POSTGRES_URI:
        return {
            "messages_failed": 0,
            "tool_calls_failed": 0,
            "skipped": "POSTGRES_URI not set",
        }

    engine = get_engine()
    summary = {
        "messages_failed": 0,
        "tool_calls_failed": 0,
        "ingests_stalled": 0,
        "idempotency_pending_failed": 0,
    }

    with engine.begin() as conn:
        repo = ReconciliationRepository(conn)
        for msg in repo.find_and_lock_stuck_messages():
            new_count = repo.increment_message_reconcile_attempts(msg["id"])
            if new_count >= MAX_MESSAGE_RECONCILE_ATTEMPTS:
                repo.mark_message_failed(
                    msg["id"],
                    error=(
                        "reconciler: stuck in pending/streaming for >5 min "
                        f"after {new_count} attempts"
                    ),
                )
                summary["messages_failed"] += 1
                _emit_alert(
                    conn,
                    name="reconciler_message_failed",
                    user_id=msg.get("user_id"),
                    detail={
                        "message_id": str(msg["id"]),
                        "attempts": new_count,
                    },
                )

    with engine.begin() as conn:
        repo = ReconciliationRepository(conn)
        for row in repo.find_and_lock_proposed_tool_calls():
            repo.mark_tool_call_failed(
                row["call_id"],
                error=(
                    "reconciler: stuck in 'proposed' for >5 min; "
                    "side effect status unknown"
                ),
            )
            summary["tool_calls_failed"] += 1
            _emit_alert(
                conn,
                name="reconciler_tool_call_failed_proposed",
                user_id=None,
                detail={
                    "call_id": row["call_id"],
                    "tool_name": row.get("tool_name"),
                },
            )

    with engine.begin() as conn:
        repo = ReconciliationRepository(conn)
        for row in repo.find_and_lock_executed_tool_calls():
            repo.mark_tool_call_failed(
                row["call_id"],
                error=(
                    "reconciler: executed-not-confirmed; side effect "
                    "assumed committed, manual cleanup required"
                ),
            )
            summary["tool_calls_failed"] += 1
            _emit_alert(
                conn,
                name="reconciler_tool_call_failed_executed",
                user_id=None,
                detail={
                    "call_id": row["call_id"],
                    "tool_name": row.get("tool_name"),
                    "action_name": row.get("action_name"),
                },
            )

    # Q4: ingest checkpoints whose heartbeat has gone silent. The
    # reconciler only escalates (alerts) — it doesn't kill the worker
    # or roll back the partial embed. The next dispatch resumes from
    # ``last_index`` thanks to the per-chunk checkpoint, so this is an
    # observability sweep, not a recovery action.
    with engine.begin() as conn:
        repo = ReconciliationRepository(conn)
        for row in repo.find_and_lock_stalled_ingests():
            summary["ingests_stalled"] += 1
            _emit_alert(
                conn,
                name="reconciler_ingest_stalled",
                user_id=None,
                detail={
                    "source_id": str(row.get("source_id")),
                    "embedded_chunks": row.get("embedded_chunks"),
                    "total_chunks": row.get("total_chunks"),
                    "last_updated": str(row.get("last_updated")),
                },
            )
            # Bump the heartbeat so we don't re-alert every tick.
            repo.touch_ingest_progress(str(row["source_id"]))

    # Q5: idempotency rows whose lease expired with attempts exhausted.
    # The wrapper's poison-loop guard normally finalises these, but if
    # the wrapper itself died mid-task (worker SIGKILL, OOM during
    # heartbeat) the row sits in ``pending`` blocking same-key retries
    # via ``_lookup_completed`` returning None for the whole 24 h TTL.
    # Promote to ``failed`` so a retry can re-claim and either resume
    # or fail loudly.
    with engine.begin() as conn:
        repo = ReconciliationRepository(conn)
        for row in repo.find_stuck_idempotency_pending(
            max_attempts=MAX_TASK_ATTEMPTS,
        ):
            error_msg = (
                "reconciler: idempotency lease expired with attempts "
                f"({row['attempt_count']}) >= {MAX_TASK_ATTEMPTS}; "
                "task abandoned"
            )
            repo.mark_idempotency_pending_failed(
                row["idempotency_key"], error=error_msg,
            )
            summary["idempotency_pending_failed"] += 1
            _emit_alert(
                conn,
                name="reconciler_idempotency_pending_failed",
                user_id=None,
                detail={
                    "idempotency_key": row["idempotency_key"],
                    "task_name": row.get("task_name"),
                    "task_id": row.get("task_id"),
                    "attempts": row.get("attempt_count"),
                },
            )

    return summary


def _emit_alert(
    conn: Connection,
    *,
    name: str,
    user_id: Optional[str],
    detail: Dict[str, Any],
) -> None:
    """Structured ``logger.error`` plus a ``stack_logs`` row for operators."""
    extra = {"alert": name, **detail}
    logger.error("reconciler alert: %s", name, extra=extra)
    try:
        StackLogsRepository(conn).insert(
            activity_id=str(uuid.uuid4()),
            endpoint="reconciliation_worker",
            level="ERROR",
            user_id=user_id,
            query=name,
            stacks=[extra],
        )
    except Exception:
        logger.exception("reconciler: failed to write stack_logs row for %s", name)
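The repository internals are not part of this diff, but a sweep like find_and_lock_stuck_messages plausibly reduces to a locked SELECT of the following shape. This is a sketch under assumed table and column names, not the actual query.

from sqlalchemy import text as sql_text

# Assumed schema, illustrative only. FOR UPDATE SKIP LOCKED lets
# concurrent reconciler ticks partition stuck rows between themselves
# instead of blocking on each other's row locks.
STUCK_MESSAGES_SQL = sql_text(
    "SELECT id, user_id FROM conversation_messages "
    "WHERE status IN ('pending', 'streaming') "
    "AND updated_at < NOW() - INTERVAL '5 minutes' "
    "FOR UPDATE SKIP LOCKED"
)


def find_and_lock_stuck_messages_sketch(conn):
    # Rows stay locked until the surrounding engine.begin() transaction
    # commits, which is why each sweep runs in its own transaction.
    return conn.execute(STUCK_MESSAGES_SQL).mappings().all()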
@@ -3,16 +3,19 @@
import json
import os
import tempfile
import uuid
import zipfile

from flask import current_app, jsonify, make_response, request
from flask_restx import fields, Namespace, Resource
from sqlalchemy import text as sql_text

from application.api import api
from application.api.user.tasks import ingest, ingest_connector_task, ingest_remote
from application.core.settings import settings
from application.parser.connectors.connector_creator import ConnectorCreator
from application.parser.file.constants import SUPPORTED_SOURCE_EXTENSIONS
from application.storage.db.repositories.idempotency import IdempotencyRepository
from application.storage.db.repositories.sources import SourcesRepository
from application.storage.db.session import db_readonly, db_session
from application.storage.storage_creator import StorageCreator
@@ -30,6 +33,79 @@ sources_upload_ns = Namespace(
)


_IDEMPOTENCY_KEY_MAX_LEN = 256


def _read_idempotency_key():
    """Return (key, error_response). Empty header → (None, None); oversized → (None, 400)."""
    key = request.headers.get("Idempotency-Key")
    if not key:
        return None, None
    if len(key) > _IDEMPOTENCY_KEY_MAX_LEN:
        return None, make_response(
            jsonify(
                {
                    "success": False,
                    "message": (
                        f"Idempotency-Key exceeds maximum length of "
                        f"{_IDEMPOTENCY_KEY_MAX_LEN} characters"
                    ),
                }
            ),
            400,
        )
    return key, None


def _scoped_idempotency_key(idempotency_key, scope):
    """``{scope}:{key}`` so different users can't collide on the same key."""
    if not idempotency_key or not scope:
        return None
    return f"{scope}:{idempotency_key}"


def _claim_task_or_get_cached(key, task_name):
    """Claim ``key`` for this request OR return the winner's cached payload.

    Pre-generates the celery task_id so a losing writer sees the same
    id immediately. Returns ``(task_id, cached_response)``; non-None
    cached means the caller should return without enqueuing.
    """
    predetermined_id = str(uuid.uuid4())
    with db_session() as conn:
        claimed = IdempotencyRepository(conn).claim_task(
            key=key, task_name=task_name, task_id=predetermined_id,
        )
    if claimed is not None:
        return claimed["task_id"], None
    with db_readonly() as conn:
        existing = IdempotencyRepository(conn).get_task(key)
    cached_id = existing.get("task_id") if existing else None
    return None, {
        "success": True,
        "task_id": cached_id or "deduplicated",
    }


def _release_claim(key):
    """Drop a pending claim so a client retry can re-claim it."""
    try:
        with db_session() as conn:
            conn.execute(
                sql_text(
                    "DELETE FROM task_dedup WHERE idempotency_key = :k "
                    "AND status = 'pending'"
                ),
                {"k": key},
            )
    except Exception:
        current_app.logger.exception(
            "Failed to release task_dedup claim for key=%s", key,
        )
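To make the claim-or-cached contract concrete, here is a toy in-memory model of the task_dedup race: a dict stands in for the table's unique key. The real implementation is the SQL upsert inside IdempotencyRepository.claim_task; this only illustrates the winner/loser split.

table = {}


def claim(key, task_id):
    if key in table:  # unique-key conflict -> loser
        return None
    table[key] = {"task_id": task_id, "status": "pending"}
    return table[key]


winner = claim("user-123:abc", "t-1")  # claims the row
loser = claim("user-123:abc", "t-2")   # None; must read the cached row
assert winner["task_id"] == "t-1" and loser is None
# The loser responds with the winner's task_id, never enqueuing t-2.
assert table["user-123:abc"]["task_id"] == "t-1"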
def _enforce_audio_path_size_limit(file_path: str, filename: str) -> None:
    if not is_audio_filename(filename):
        return
@@ -49,17 +125,38 @@ class UploadFile(Resource):
        )
    )
    @api.doc(
        description="Uploads a file to be vectorized and indexed",
        description=(
            "Uploads a file to be vectorized and indexed. Honors an optional "
            "``Idempotency-Key`` header: a repeat request with the same key "
            "within 24h returns the original cached response without re-enqueuing."
        ),
    )
    def post(self):
        decoded_token = request.decoded_token
        if not decoded_token:
            return make_response(jsonify({"success": False}), 401)
        user = decoded_token.get("sub")
        idempotency_key, key_error = _read_idempotency_key()
        if key_error is not None:
            return key_error
        # User-scoped to avoid cross-user collisions; also feeds
        # ``_derive_source_id`` so uuid5 stays user-disjoint.
        scoped_key = _scoped_idempotency_key(idempotency_key, user)
        # Claim before enqueue; the loser returns the winner's task_id.
        predetermined_task_id = None
        if scoped_key:
            predetermined_task_id, cached = _claim_task_or_get_cached(
                scoped_key, "ingest",
            )
            if cached is not None:
                return make_response(jsonify(cached), 200)
        data = request.form
        files = request.files.getlist("file")
        required_fields = ["user", "name"]
        missing_fields = check_required_fields(data, required_fields)
        if missing_fields or not files or all(file.filename == "" for file in files):
            if scoped_key:
                _release_claim(scoped_key)
            return make_response(
                jsonify(
                    {
@@ -69,7 +166,6 @@ class UploadFile(Resource):
                ),
                400,
            )
        user = decoded_token.get("sub")
        job_name = request.form["name"]

        # Create safe versions for filesystem operations
@@ -140,16 +236,27 @@ class UploadFile(Resource):
                    file_path = f"{base_path}/{safe_file}"
                    with open(temp_file_path, "rb") as f:
                        storage.save_file(f, file_path)
            task = ingest.delay(
                settings.UPLOAD_FOLDER,
                list(SUPPORTED_SOURCE_EXTENSIONS),
                job_name,
                user,
                file_path=base_path,
                filename=dir_name,
                file_name_map=file_name_map,
            ingest_kwargs = dict(
                args=(
                    settings.UPLOAD_FOLDER,
                    list(SUPPORTED_SOURCE_EXTENSIONS),
                    job_name,
                    user,
                ),
                kwargs={
                    "file_path": base_path,
                    "filename": dir_name,
                    "file_name_map": file_name_map,
                    # Scoped so the worker dedup row matches the HTTP claim.
                    "idempotency_key": scoped_key or idempotency_key,
                },
            )
            if predetermined_task_id is not None:
                ingest_kwargs["task_id"] = predetermined_task_id
            task = ingest.apply_async(**ingest_kwargs)
        except AudioFileTooLargeError:
            if scoped_key:
                _release_claim(scoped_key)
            return make_response(
                jsonify(
                    {
@@ -161,8 +268,13 @@ class UploadFile(Resource):
            )
        except Exception as err:
            current_app.logger.error(f"Error uploading file: {err}", exc_info=True)
            if scoped_key:
                _release_claim(scoped_key)
            return make_response(jsonify({"success": False}), 400)
        return make_response(jsonify({"success": True, "task_id": task.id}), 200)
        # Predetermined id matches the dedup-claim row; loser GET sees same.
        response_task_id = predetermined_task_id or task.id
        response_payload = {"success": True, "task_id": response_task_id}
        return make_response(jsonify(response_payload), 200)


@sources_upload_ns.route("/remote")
@@ -182,17 +294,38 @@ class UploadRemote(Resource):
        )
    )
    @api.doc(
        description="Uploads remote source for vectorization",
        description=(
            "Uploads remote source for vectorization. Honors an optional "
            "``Idempotency-Key`` header: a repeat request with the same key "
            "within 24h returns the original cached response without re-enqueuing."
        ),
    )
    def post(self):
        decoded_token = request.decoded_token
        if not decoded_token:
            return make_response(jsonify({"success": False}), 401)
        user = decoded_token.get("sub")
        idempotency_key, key_error = _read_idempotency_key()
        if key_error is not None:
            return key_error
        scoped_key = _scoped_idempotency_key(idempotency_key, user)
        data = request.form
        required_fields = ["user", "source", "name", "data"]
        missing_fields = check_required_fields(data, required_fields)
        if missing_fields:
            return missing_fields
        task_name_for_dedup = (
            "ingest_connector_task"
            if data.get("source") in ConnectorCreator.get_supported_connectors()
            else "ingest_remote"
        )
        predetermined_task_id = None
        if scoped_key:
            predetermined_task_id, cached = _claim_task_or_get_cached(
                scoped_key, task_name_for_dedup,
            )
            if cached is not None:
                return make_response(jsonify(cached), 200)
        try:
            config = json.loads(data["data"])
            source_data = None
@@ -208,6 +341,8 @@ class UploadRemote(Resource):
            elif data["source"] in ConnectorCreator.get_supported_connectors():
                session_token = config.get("session_token")
                if not session_token:
                    if scoped_key:
                        _release_claim(scoped_key)
                    return make_response(
                        jsonify(
                            {
@@ -236,31 +371,47 @@ class UploadRemote(Resource):
                config["file_ids"] = file_ids
                config["folder_ids"] = folder_ids

                task = ingest_connector_task.delay(
                    job_name=data["name"],
                    user=decoded_token.get("sub"),
                    source_type=data["source"],
                    session_token=session_token,
                    file_ids=file_ids,
                    folder_ids=folder_ids,
                    recursive=config.get("recursive", False),
                    retriever=config.get("retriever", "classic"),
                )
                return make_response(
                    jsonify({"success": True, "task_id": task.id}), 200
                )
            task = ingest_remote.delay(
                source_data=source_data,
                job_name=data["name"],
                user=decoded_token.get("sub"),
                loader=data["source"],
            )
                connector_kwargs = {
                    "kwargs": {
                        "job_name": data["name"],
                        "user": user,
                        "source_type": data["source"],
                        "session_token": session_token,
                        "file_ids": file_ids,
                        "folder_ids": folder_ids,
                        "recursive": config.get("recursive", False),
                        "retriever": config.get("retriever", "classic"),
                        "idempotency_key": scoped_key or idempotency_key,
                    },
                }
                if predetermined_task_id is not None:
                    connector_kwargs["task_id"] = predetermined_task_id
                task = ingest_connector_task.apply_async(**connector_kwargs)
                response_task_id = predetermined_task_id or task.id
                response_payload = {"success": True, "task_id": response_task_id}
                return make_response(jsonify(response_payload), 200)
            remote_kwargs = {
                "kwargs": {
                    "source_data": source_data,
                    "job_name": data["name"],
                    "user": user,
                    "loader": data["source"],
                    "idempotency_key": scoped_key or idempotency_key,
                },
            }
            if predetermined_task_id is not None:
                remote_kwargs["task_id"] = predetermined_task_id
            task = ingest_remote.apply_async(**remote_kwargs)
        except Exception as err:
            current_app.logger.error(
                f"Error uploading remote source: {err}", exc_info=True
            )
            if scoped_key:
                _release_claim(scoped_key)
            return make_response(jsonify({"success": False}), 400)
        return make_response(jsonify({"success": True, "task_id": task.id}), 200)
        response_task_id = predetermined_task_id or task.id
        response_payload = {"success": True, "task_id": response_task_id}
        return make_response(jsonify(response_payload), 200)


@sources_upload_ns.route("/manage_source_files")
@@ -305,6 +456,10 @@ class ManageSourceFiles(Resource):
                jsonify({"success": False, "message": "Unauthorized"}), 401
            )
        user = decoded_token.get("sub")
        idempotency_key, key_error = _read_idempotency_key()
        if key_error is not None:
            return key_error
        scoped_key = _scoped_idempotency_key(idempotency_key, user)
        source_id = request.form.get("source_id")
        operation = request.form.get("operation")

@@ -347,6 +502,12 @@ class ManageSourceFiles(Resource):
                jsonify({"success": False, "message": "Database error"}), 500
            )
        resolved_source_id = str(source["id"])
        # Flips to True after each branch's ``apply_async`` returns
        # successfully — at that point the worker owns the predetermined
        # task_id. The outer ``except`` only releases the claim while
        # this is False, so a post-``apply_async`` failure (jsonify,
        # make_response, etc.) doesn't double-enqueue on the next retry.
        claim_transferred = False
        try:
            storage = StorageCreator.get_storage()
            source_file_path = source.get("file_path", "")
@@ -379,6 +540,21 @@ class ManageSourceFiles(Resource):
                        ),
                        400,
                    )

                # Claim before any storage mutation so a duplicate request
                # short-circuits without touching the filesystem. Mirrors
                # the pattern in ``UploadFile.post`` / ``UploadRemote.post``
                # — without it ``.delay()`` would enqueue twice for two
                # racing same-key POSTs (the worker decorator only
                # deduplicates *after* completion).
                predetermined_task_id = None
                if scoped_key:
                    predetermined_task_id, cached = _claim_task_or_get_cached(
                        scoped_key, "reingest_source_task",
                    )
                    if cached is not None:
                        return make_response(jsonify(cached), 200)

                added_files = []
                map_updated = False

@@ -414,9 +590,15 @@ class ManageSourceFiles(Resource):

                from application.api.user.tasks import reingest_source_task

                task = reingest_source_task.delay(
                    source_id=resolved_source_id, user=user
                task = reingest_source_task.apply_async(
                    kwargs={
                        "source_id": resolved_source_id,
                        "user": user,
                        "idempotency_key": scoped_key or idempotency_key,
                    },
                    task_id=predetermined_task_id,
                )
                claim_transferred = True

                return make_response(
                    jsonify(
@@ -455,10 +637,8 @@ class ManageSourceFiles(Resource):
                        ),
                        400,
                    )
                # Remove files from storage and directory structure

                removed_files = []
                map_updated = False
                # Path-traversal guard runs *before* the claim so a 400
                # for an invalid path doesn't leave a pending dedup row.
                for file_path in file_paths:
                    if ".." in str(file_path) or str(file_path).startswith("/"):
                        return make_response(
@@ -470,6 +650,22 @@ class ManageSourceFiles(Resource):
                            ),
                            400,
                        )

                # Claim before any storage mutation. See ``add`` branch
                # comment for rationale.
                predetermined_task_id = None
                if scoped_key:
                    predetermined_task_id, cached = _claim_task_or_get_cached(
                        scoped_key, "reingest_source_task",
                    )
                    if cached is not None:
                        return make_response(jsonify(cached), 200)

                # Remove files from storage and directory structure

                removed_files = []
                map_updated = False
                for file_path in file_paths:
                    full_path = f"{source_file_path}/{file_path}"

                    # Remove from storage
@@ -491,9 +687,15 @@ class ManageSourceFiles(Resource):

                from application.api.user.tasks import reingest_source_task

                task = reingest_source_task.delay(
                    source_id=resolved_source_id, user=user
                task = reingest_source_task.apply_async(
                    kwargs={
                        "source_id": resolved_source_id,
                        "user": user,
                        "idempotency_key": scoped_key or idempotency_key,
                    },
                    task_id=predetermined_task_id,
                )
                claim_transferred = True

                return make_response(
                    jsonify(
@@ -552,6 +754,16 @@ class ManageSourceFiles(Resource):
                        ),
                        404,
                    )

                # Claim before mutation. See ``add`` branch for rationale.
                predetermined_task_id = None
                if scoped_key:
                    predetermined_task_id, cached = _claim_task_or_get_cached(
                        scoped_key, "reingest_source_task",
                    )
                    if cached is not None:
                        return make_response(jsonify(cached), 200)

                success = storage.remove_directory(full_directory_path)

                if not success:
@@ -560,6 +772,11 @@ class ManageSourceFiles(Resource):
                        f"User: {user}, Source ID: {source_id}, Directory path: {directory_path}, "
                        f"Full path: {full_directory_path}"
                    )
                    # Release so a client retry can reclaim — otherwise
                    # the next request would silently 200-cache to the
                    # task_id that never enqueued.
                    if scoped_key:
                        _release_claim(scoped_key)
                    return make_response(
                        jsonify(
                            {"success": False, "message": "Failed to remove directory"}
@@ -591,9 +808,15 @@ class ManageSourceFiles(Resource):

                from application.api.user.tasks import reingest_source_task

                task = reingest_source_task.delay(
                    source_id=resolved_source_id, user=user
                task = reingest_source_task.apply_async(
                    kwargs={
                        "source_id": resolved_source_id,
                        "user": user,
                        "idempotency_key": scoped_key or idempotency_key,
                    },
                    task_id=predetermined_task_id,
                )
                claim_transferred = True

                return make_response(
                    jsonify(
@@ -607,6 +830,14 @@ class ManageSourceFiles(Resource):
                200,
            )
        except Exception as err:
            # Release the dedup claim only if it wasn't transferred to
            # a worker. Without this, a same-key retry within the 24h
            # TTL would 200-cache to a predetermined task_id whose
            # ``apply_async`` never ran (or ran but the response builder
            # blew up afterward — only the first case matters in
            # practice; the flag protects both).
            if scoped_key and not claim_transferred:
                _release_claim(scoped_key)
            error_context = f"operation={operation}, user={user}, source_id={source_id}"
            if operation == "remove_directory":
                directory_path = request.form.get("directory_path", "")
@@ -1,5 +1,6 @@
from datetime import timedelta

from application.api.user.idempotency import with_idempotency
from application.celery_init import celery
from application.worker import (
    agent_webhook_worker,
@@ -13,9 +14,32 @@ from application.worker import (
)


@celery.task(bind=True)
# Shared decorator config for long-running, side-effecting tasks. ``acks_late``
# is also the celeryconfig default but stays explicit here so each task's
# durability story is grep-able next to the body. Combined with
# ``autoretry_for=(Exception,)`` and a bounded ``max_retries`` so a poison
# message can't loop forever.
DURABLE_TASK = dict(
    bind=True,
    acks_late=True,
    autoretry_for=(Exception,),
    retry_kwargs={"max_retries": 3, "countdown": 60},
    retry_backoff=True,
)


@celery.task(**DURABLE_TASK)
@with_idempotency(task_name="ingest")
def ingest(
    self, directory, formats, job_name, user, file_path, filename, file_name_map=None
    self,
    directory,
    formats,
    job_name,
    user,
    file_path,
    filename,
    file_name_map=None,
    idempotency_key=None,
):
    resp = ingest_worker(
        self,
@@ -26,25 +50,35 @@ def ingest(
        filename,
        user,
        file_name_map=file_name_map,
        idempotency_key=idempotency_key,
    )
    return resp


@celery.task(bind=True)
def ingest_remote(self, source_data, job_name, user, loader):
    resp = remote_worker(self, source_data, job_name, user, loader)
@celery.task(**DURABLE_TASK)
@with_idempotency(task_name="ingest_remote")
def ingest_remote(self, source_data, job_name, user, loader, idempotency_key=None):
    resp = remote_worker(
        self, source_data, job_name, user, loader,
        idempotency_key=idempotency_key,
    )
    return resp


@celery.task(bind=True)
def reingest_source_task(self, source_id, user):
@celery.task(**DURABLE_TASK)
@with_idempotency(task_name="reingest_source_task")
def reingest_source_task(self, source_id, user, idempotency_key=None):
    from application.worker import reingest_source_worker

    resp = reingest_source_worker(self, source_id, user)
    return resp


@celery.task(bind=True)
# Beat-driven dispatch tasks default to ``acks_late=False``: a SIGKILL
# of a beat tick is harmless to redeliver only if the dispatch itself is
# idempotent. We keep these early-ACK so the broker doesn't replay a
# dispatch that already enqueued downstream work.
@celery.task(bind=True, acks_late=False)
def schedule_syncs(self, frequency):
    resp = sync_worker(self, frequency)
    return resp
@@ -74,19 +108,22 @@ def sync_source(
    return resp


@celery.task(bind=True)
def store_attachment(self, file_info, user):
@celery.task(**DURABLE_TASK)
@with_idempotency(task_name="store_attachment")
def store_attachment(self, file_info, user, idempotency_key=None):
    resp = attachment_worker(self, file_info, user)
    return resp


@celery.task(bind=True)
def process_agent_webhook(self, agent_id, payload):
@celery.task(**DURABLE_TASK)
@with_idempotency(task_name="process_agent_webhook")
def process_agent_webhook(self, agent_id, payload, idempotency_key=None):
    resp = agent_webhook_worker(self, agent_id, payload)
    return resp


@celery.task(bind=True)
@celery.task(**DURABLE_TASK)
@with_idempotency(task_name="ingest_connector_task")
def ingest_connector_task(
    self,
    job_name,
@@ -100,6 +137,7 @@ def ingest_connector_task(
    operation_mode="upload",
    doc_id=None,
    sync_frequency="never",
    idempotency_key=None,
):
    from application.worker import ingest_connector

@@ -116,6 +154,7 @@ def ingest_connector_task(
        operation_mode=operation_mode,
        doc_id=doc_id,
        sync_frequency=sync_frequency,
        idempotency_key=idempotency_key,
    )
    return resp

@@ -140,6 +179,19 @@ def setup_periodic_tasks(sender, **kwargs):
        cleanup_pending_tool_state.s(),
        name="cleanup-pending-tool-state",
    )
    # Pure housekeeping for ``task_dedup`` / ``webhook_dedup`` — the
    # upsert paths already handle stale rows, so cadence only bounds
    # table size. Hourly is plenty for typical traffic.
    sender.add_periodic_task(
        timedelta(hours=1),
        cleanup_idempotency_dedup.s(),
        name="cleanup-idempotency-dedup",
    )
    sender.add_periodic_task(
        timedelta(seconds=30),
        reconciliation_task.s(),
        name="reconciliation",
    )
    sender.add_periodic_task(
        timedelta(hours=7),
        version_check_task.s(),
@@ -159,18 +211,12 @@ def mcp_oauth_status_task(self, task_id):
    return resp


@celery.task(bind=True)
@celery.task(bind=True, acks_late=False)
def cleanup_pending_tool_state(self):
    """Delete pending_tool_state rows past their TTL.

    Replaces Mongo's ``expireAfterSeconds=0`` TTL index — Postgres has
    no native TTL, so this task runs every 60 seconds to keep
    ``pending_tool_state`` bounded. No-ops if ``POSTGRES_URI`` isn't
    configured (keeps the task runnable in Mongo-only environments).
    """
    """Revert stale ``resuming`` rows, then delete TTL-expired rows."""
    from application.core.settings import settings
    if not settings.POSTGRES_URI:
        return {"deleted": 0, "skipped": "POSTGRES_URI not set"}
        return {"deleted": 0, "reverted": 0, "skipped": "POSTGRES_URI not set"}

    from application.storage.db.engine import get_engine
    from application.storage.db.repositories.pending_tool_state import (
@@ -179,11 +225,47 @@ def cleanup_pending_tool_state(self):

    engine = get_engine()
    with engine.begin() as conn:
        deleted = PendingToolStateRepository(conn).cleanup_expired()
    return {"deleted": deleted}
        repo = PendingToolStateRepository(conn)
        reverted = repo.revert_stale_resuming(grace_seconds=600)
        deleted = repo.cleanup_expired()
    return {"deleted": deleted, "reverted": reverted}


@celery.task(bind=True)
@celery.task(bind=True, acks_late=False)
def cleanup_idempotency_dedup(self):
    """Delete TTL-expired rows from ``task_dedup`` and ``webhook_dedup``.

    Pure housekeeping — the upsert paths already ignore stale rows
    (TTL-aware ``ON CONFLICT DO UPDATE``), so this only bounds table
    growth and keeps SELECT planning tight on large deployments.
    """
    from application.core.settings import settings
    if not settings.POSTGRES_URI:
        return {
            "task_dedup_deleted": 0,
            "webhook_dedup_deleted": 0,
            "skipped": "POSTGRES_URI not set",
        }

    from application.storage.db.engine import get_engine
    from application.storage.db.repositories.idempotency import (
        IdempotencyRepository,
    )

    engine = get_engine()
    with engine.begin() as conn:
        return IdempotencyRepository(conn).cleanup_expired()


@celery.task(bind=True, acks_late=False)
def reconciliation_task(self):
    """Sweep stuck durability rows and escalate them to terminal status + alert."""
    from application.api.user.reconciliation import run_reconciliation

    return run_reconciliation()


@celery.task(bind=True, acks_late=False)
def version_check_task(self):
    """Periodic anonymous version check.
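As a usage note, adding another durable task only needs the shared config plus the wrapper; the failure path then follows from the pieces above. The task below is hypothetical, purely for illustration.

@celery.task(**DURABLE_TASK)
@with_idempotency(task_name="export_source")
def export_source(self, source_id, user, idempotency_key=None):
    # On an exception: the wrapper releases its lease, autoretry_for
    # re-enqueues with backoff (up to 3 retries), and acks_late means a
    # SIGKILL before completion redelivers rather than drops the task.
    ...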
@@ -9,6 +9,7 @@ import json
import logging
import time
import traceback
from datetime import datetime
from typing import Any, Dict, Generator, Optional

from flask import Blueprint, jsonify, make_response, request, Response
@@ -306,7 +307,16 @@ def list_models():
            401,
        )

    # Repository rows now go through ``coerce_pg_native`` at SELECT
    # time, so timestamps arrive as ISO 8601 strings. Parse before
    # taking ``.timestamp()``; fall back to ``time.time()`` only when
    # the value is genuinely missing or unparseable.
    created = agent.get("created_at") or agent.get("createdAt")
    if isinstance(created, str):
        try:
            created = datetime.fromisoformat(created)
        except (ValueError, TypeError):
            created = None
    created_ts = (
        int(created.timestamp()) if hasattr(created, "timestamp")
        else int(time.time())
@@ -200,7 +200,9 @@ def _bind_user_id_to_log_context():
def after_request(response: Response) -> Response:
    """Add CORS headers for the pure Flask development entrypoint."""
    response.headers["Access-Control-Allow-Origin"] = "*"
    response.headers["Access-Control-Allow-Headers"] = "Content-Type, Authorization"
    response.headers["Access-Control-Allow-Headers"] = (
        "Content-Type, Authorization, Idempotency-Key"
    )
    response.headers["Access-Control-Allow-Methods"] = "GET, POST, PUT, PATCH, DELETE, OPTIONS"
    return response
@@ -25,7 +25,12 @@ asgi_app = Starlette(
        CORSMiddleware,
        allow_origins=["*"],
        allow_methods=["GET", "POST", "PUT", "PATCH", "DELETE", "OPTIONS"],
        allow_headers=["Content-Type", "Authorization", "Mcp-Session-Id"],
        allow_headers=[
            "Content-Type",
            "Authorization",
            "Mcp-Session-Id",
            "Idempotency-Key",
        ],
        expose_headers=["Mcp-Session-Id"],
    ),
],
@@ -1,7 +1,10 @@
import os
from application.core.settings import settings

broker_url = os.getenv("CELERY_BROKER_URL")
result_backend = os.getenv("CELERY_RESULT_BACKEND")
# Pydantic loads .env into ``settings`` but does not inject values into
# ``os.environ`` — read directly from settings so beat startup (which
# imports this module before any explicit env load) sees a real URL.
broker_url = settings.CELERY_BROKER_URL
result_backend = settings.CELERY_RESULT_BACKEND

task_serializer = 'json'
result_serializer = 'json'
@@ -10,7 +13,21 @@ accept_content = ['json']
# Autodiscover tasks
imports = ('application.api.user.tasks',)

# Project-scoped queue so a stray sibling worker on the same broker
# (other repo, same default ``celery`` queue) can't grab DocsGPT tasks.
task_default_queue = "docsgpt"
task_default_exchange = "docsgpt"
task_default_routing_key = "docsgpt"

beat_scheduler = "redbeat.RedBeatScheduler"
redbeat_redis_url = broker_url
redbeat_key_prefix = "redbeat:docsgpt:"
redbeat_lock_timeout = 90

# Survive worker SIGKILL/OOM without silently dropping in-flight tasks.
task_acks_late = True
task_reject_on_worker_lost = True
worker_prefetch_multiplier = settings.CELERY_WORKER_PREFETCH_MULTIPLIER
broker_transport_options = {"visibility_timeout": settings.CELERY_VISIBILITY_TIMEOUT}
result_expires = 86400 * 7
task_track_started = True
@@ -30,6 +30,12 @@ class Settings(BaseSettings):

    CELERY_BROKER_URL: str = "redis://localhost:6379/0"
    CELERY_RESULT_BACKEND: str = "redis://localhost:6379/1"
    # Prefetch=1 caps SIGKILL loss to one task. Visibility timeout must exceed
    # the longest legitimate task runtime (ingest, agent webhook) but stay
    # short enough that SIGKILLed tasks redeliver promptly. 1h matches Onyx
    # and Dify defaults; long ingests can override via env.
    CELERY_WORKER_PREFETCH_MULTIPLIER: int = 1
    CELERY_VISIBILITY_TIMEOUT: int = 3600
    # Only consulted when VECTOR_STORE=mongodb or when running scripts/db/backfill.py; user data lives in Postgres.
    MONGO_URI: Optional[str] = None
    # User-data Postgres DB.

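Worked through with the defaults above: a worker SIGKILLed mid-task leaves at
most one unacked message in limbo (prefetch=1), and the 3600 s visibility
timeout means the broker redelivers it within an hour at worst; raising the
timeout to cover a longer ingest delays that redelivery by the same amount,
which is exactly the tradeoff the comment describes.
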
@@ -80,6 +80,14 @@ class BaseLLM(ABC):
                agent_id=self.agent_id,
                model_user_id=self.model_user_id,
            )
            # Tag the fallback LLM so its rows land as
            # ``source='fallback'`` in cost-attribution dashboards.
            # Propagate the parent's ``_request_id`` so a user
            # request that ran fallback is still grouped under one id.
            self._fallback_llm._token_usage_source = "fallback"
            self._fallback_llm._request_id = getattr(
                self, "_request_id", None,
            )
            logger.info(
                f"Fallback LLM initialized from agent backup model: "
                f"{provider}/{backup_model_id}"
@@ -106,6 +114,11 @@ class BaseLLM(ABC):
                agent_id=self.agent_id,
                model_user_id=self.model_user_id,
            )
            # Same rationale as the agent-backup branch.
            self._fallback_llm._token_usage_source = "fallback"
            self._fallback_llm._request_id = getattr(
                self, "_request_id", None,
            )
            logger.info(
                f"Fallback LLM initialized from global settings: "
                f"{settings.FALLBACK_LLM_PROVIDER}/{settings.FALLBACK_LLM_NAME}"

@@ -6,6 +6,7 @@ from google.genai import types
from application.core.settings import settings

from application.llm.base import BaseLLM
from application.llm.handlers.google import _decode_thought_signature
from application.storage.storage_creator import StorageCreator


@@ -258,7 +259,7 @@ class GoogleLLM(BaseLLM):
                except (_json.JSONDecodeError, TypeError):
                    args = {}
                cleaned_args = self._remove_null_values(args)
                thought_sig = tc.get("thought_signature")
                thought_sig = _decode_thought_signature(tc.get("thought_signature"))
                if thought_sig:
                    parts.append(
                        types.Part(
@@ -322,7 +323,9 @@ class GoogleLLM(BaseLLM):
                                name=item["function_call"]["name"],
                                args=cleaned_args,
                            ),
                            thoughtSignature=item["thought_signature"],
                            thoughtSignature=_decode_thought_signature(
                                item["thought_signature"]
                            ),
                        )
                    )
                else:

@@ -10,6 +10,18 @@ from application.logging import build_stack_data
logger = logging.getLogger(__name__)


# Cap the agent tool-call loop. Without this an LLM that keeps
# requesting more tool calls (preview models, sparse tool results,
# under-specified prompts) can chain searches indefinitely and the
# stream never finalises. 25 mirrors Dify's default.
MAX_TOOL_ITERATIONS = 25
_FINALIZE_INSTRUCTION = (
    f"You have made {MAX_TOOL_ITERATIONS} tool calls. Provide a final "
    "response to the user based on what you have, without making any "
    "additional tool calls."
)


@dataclass
class ToolCall:
    """Represents a tool/function call from the LLM."""
@@ -624,6 +636,10 @@ class LLMHandler(ABC):
            agent_id=getattr(agent, "agent_id", None),
            model_user_id=compression_user_id,
        )
        # Side-channel LLM tag — see ``orchestrator.py`` for rationale.
        compression_llm._token_usage_source = "compression"
        compression_llm._request_id = getattr(agent, "_request_id", None) \
            or getattr(getattr(agent, "llm", None), "_request_id", None)

        # Create service without DB persistence capability
        compression_service = CompressionService(
@@ -934,7 +950,9 @@ class LLMHandler(ABC):
        parsed = self.parse_response(response)
        self.llm_calls.append(build_stack_data(agent.llm))

        iteration = 0
        while parsed.requires_tool_call:
            iteration += 1
            tool_handler_gen = self.handle_tool_calls(
                agent, parsed.tool_calls, tools_dict, messages
            )
@@ -958,6 +976,25 @@ class LLMHandler(ABC):
                    }
                    return ""

            # Cap reached: force one final tool-less call so the stream
            # always ends with content rather than cutting off.
            if iteration >= MAX_TOOL_ITERATIONS:
                logger.warning(
                    "agent tool loop hit cap (%d); forcing finalize",
                    MAX_TOOL_ITERATIONS,
                )
                messages.append(
                    {"role": "system", "content": _FINALIZE_INSTRUCTION},
                )
                response = agent.llm.gen(
                    model=getattr(agent.llm, "model_id", None) or agent.model_id,
                    messages=messages,
                    tools=None,
                )
                parsed = self.parse_response(response)
                self.llm_calls.append(build_stack_data(agent.llm))
                break

            # ``agent.model_id`` is the registry id (a UUID for BYOM
            # records). Use the LLM's own model_id, which LLMCreator
            # already resolved to the upstream model name. Built-ins:
@@ -973,7 +1010,12 @@ class LLMHandler(ABC):
        return parsed.content

    def handle_streaming(
        self, agent, response: Any, tools_dict: Dict, messages: List[Dict]
        self,
        agent,
        response: Any,
        tools_dict: Dict,
        messages: List[Dict],
        _iteration: int = 0,
    ) -> Generator:
        """
        Handle streaming response flow.
@@ -1042,6 +1084,9 @@ class LLMHandler(ABC):
                    }
                    return

            next_iteration = _iteration + 1
            cap_reached = next_iteration >= MAX_TOOL_ITERATIONS

            # Check if context limit was reached during tool execution
            if hasattr(agent, 'context_limit_reached') and agent.context_limit_reached:
                # Add system message warning about context limit
@@ -1054,16 +1099,32 @@ class LLMHandler(ABC):
                    )
                })
                logger.info("Context limit reached - instructing agent to wrap up")
            elif cap_reached:
                logger.warning(
                    "agent tool loop hit cap (%d); forcing finalize",
                    MAX_TOOL_ITERATIONS,
                )
                messages.append(
                    {"role": "system", "content": _FINALIZE_INSTRUCTION},
                )

            # See note above on agent.model_id vs llm.model_id.
            response = agent.llm.gen_stream(
                model=getattr(agent.llm, "model_id", None) or agent.model_id,
                messages=messages,
                tools=agent.tools if not agent.context_limit_reached else None,
                tools=(
                    None
                    if cap_reached
                    or getattr(agent, "context_limit_reached", False)
                    else agent.tools
                ),
            )
            self.llm_calls.append(build_stack_data(agent.llm))

            yield from self.handle_streaming(agent, response, tools_dict, messages)
            yield from self.handle_streaming(
                agent, response, tools_dict, messages,
                _iteration=next_iteration,
            )
            return
        if parsed.content:
            buffer += parsed.content

@@ -1,9 +1,35 @@
import base64
import binascii
import uuid
from typing import Any, Dict, Generator
from typing import Any, Dict, Generator, Optional, Union

from application.llm.handlers.base import LLMHandler, LLMResponse, ToolCall


def _encode_thought_signature(sig: Optional[Union[bytes, str]]) -> Optional[str]:
    # Gemini's Python SDK returns thought_signature as raw bytes, but the
    # field is typed Optional[str] downstream and gets json.dumps'd into
    # SSE events. Encode once at ingress so callers only ever see a str.
    if isinstance(sig, bytes):
        return base64.b64encode(sig).decode("ascii")
    return sig


def _decode_thought_signature(
    sig: Optional[Union[bytes, str]],
) -> Optional[Union[bytes, str]]:
    # Reverse of _encode_thought_signature — Gemini's SDK expects bytes
    # back when we replay a tool call. ``validate=True`` keeps ASCII
    # strings that aren't strict base64 (but would loosely decode) from
    # being silently turned into bytes; non-base64 inputs pass through
    # unchanged.
    if isinstance(sig, str):
        try:
            return base64.b64decode(sig.encode("ascii"), validate=True)
        except (binascii.Error, ValueError):
            return sig
    return sig


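A quick round-trip of the two helpers (values illustrative):

    # _encode_thought_signature(b"\x01\x02")   -> "AQI="        (str, JSON-safe)
    # _decode_thought_signature("AQI=")        -> b"\x01\x02"   (bytes for the SDK)
    # _decode_thought_signature("not base64!") -> "not base64!" (passes through)
    # _encode_thought_signature(None)          -> None
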
class GoogleLLMHandler(LLMHandler):
    """Handler for Google's GenAI API."""

@@ -23,7 +49,7 @@ class GoogleLLMHandler(LLMHandler):
        for idx, part in enumerate(parts):
            if hasattr(part, "function_call") and part.function_call is not None:
                has_sig = hasattr(part, "thought_signature") and part.thought_signature is not None
                thought_sig = part.thought_signature if has_sig else None
                thought_sig = _encode_thought_signature(part.thought_signature) if has_sig else None
                tool_calls.append(
                    ToolCall(
                        id=str(uuid.uuid4()),
@@ -50,7 +76,7 @@ class GoogleLLMHandler(LLMHandler):
        tool_calls = []
        if hasattr(response, "function_call") and response.function_call is not None:
            has_sig = hasattr(response, "thought_signature") and response.thought_signature is not None
            thought_sig = response.thought_signature if has_sig else None
            thought_sig = _encode_thought_signature(response.thought_signature) if has_sig else None
            tool_calls.append(
                ToolCall(
                    id=str(uuid.uuid4()),
@@ -70,8 +96,15 @@ class GoogleLLMHandler(LLMHandler):
        """Create a tool result message in the standard internal format."""
        import json as _json

        from application.storage.db.serialization import PGNativeJSONEncoder

        # PostgresTool results commonly include PG-native types
        # (datetime / date / UUID / Decimal / bytea) when SELECT touches
        # timestamptz / numeric / uuid / bytea columns. The shared
        # encoder handles all five — bytes get base64 (lossless) instead
        # of the ``str(b'...')`` repr that ``default=str`` would emit.
        content = (
            _json.dumps(result)
            _json.dumps(result, cls=PGNativeJSONEncoder)
            if not isinstance(result, str)
            else result
        )

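``PGNativeJSONEncoder`` itself is not part of this diff; a minimal sketch
consistent with the comment above (datetimes/dates to ISO 8601, UUID/Decimal
to str, bytes to base64) could look like:

    import base64
    import datetime
    import decimal
    import json
    import uuid

    class PGNativeJSONEncoderSketch(json.JSONEncoder):
        # Hypothetical stand-in; the real encoder lives in
        # application.storage.db.serialization.
        def default(self, o):
            if isinstance(o, (datetime.datetime, datetime.date)):
                return o.isoformat()
            if isinstance(o, (uuid.UUID, decimal.Decimal)):
                return str(o)
            if isinstance(o, (bytes, bytearray)):
                # base64 is lossless, unlike the str(b'...') repr that
                # default=str would emit for bytea values.
                return base64.b64encode(o).decode("ascii")
            return super().default(o)
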
@@ -40,8 +40,15 @@ class OpenAILLMHandler(LLMHandler):
        """Create a tool result message in the standard internal format."""
        import json as _json

        from application.storage.db.serialization import PGNativeJSONEncoder

        # PostgresTool results commonly include PG-native types
        # (datetime / date / UUID / Decimal / bytea) when SELECT touches
        # timestamptz / numeric / uuid / bytea columns. The shared
        # encoder handles all five — bytes get base64 (lossless) instead
        # of the ``str(b'...')`` repr that ``default=str`` would emit.
        content = (
            _json.dumps(result)
            _json.dumps(result, cls=PGNativeJSONEncoder)
            if not isinstance(result, str)
            else result
        )

@@ -1,12 +1,27 @@
import os
import logging
from typing import List, Any
from typing import Any, List, Optional
from retry import retry
from tqdm import tqdm
from application.core.settings import settings
from application.storage.db.repositories.ingest_chunk_progress import (
    IngestChunkProgressRepository,
)
from application.storage.db.session import db_session
from application.vectorstore.vector_creator import VectorCreator


class EmbeddingPipelineError(Exception):
    """Raised when the per-chunk embed loop produces a partial index.

    Escapes into Celery's ``autoretry_for`` so a transient cause (rate
    limit, network blip) gets another shot. The chunk-progress
    checkpoint makes retries cheap — only the failed-and-after chunks
    re-run. After ``MAX_TASK_ATTEMPTS`` the poison-loop guard in
    ``with_idempotency`` finalises the row as ``failed``.
    """


def sanitize_content(content: str) -> str:
    """
    Remove NUL characters that can cause vector store ingestion to fail.
@@ -22,7 +37,11 @@ def sanitize_content(content: str) -> str:
    return content.replace('\x00', '')


@retry(tries=10, delay=60)
# Per-chunk inline retry. Aggressive defaults (tries=10, delay=60) blocked
# the loop for up to 9 min per chunk and wedged the heartbeat: lower the
# tail so a transient failure fails fast and the chunk-progress checkpoint
# resumes cleanly on the next dispatch.
@retry(tries=3, delay=5, backoff=2)
def add_text_to_store_with_retry(store: Any, doc: Any, source_id: str) -> None:
    """Add a document's text and metadata to the vector store with retry logic.

@@ -45,21 +64,119 @@ def add_text_to_store_with_retry(store: Any, doc: Any, source_id: str) -> None:
        raise


def embed_and_store_documents(docs: List[Any], folder_name: str, source_id: str, task_status: Any) -> None:
def _init_progress_and_resume_index(
    source_id: str, total_chunks: int, attempt_id: Optional[str],
) -> int:
    """Upsert the progress row and return the next chunk index to embed.

    The repository's upsert preserves ``last_index`` only when the
    incoming ``attempt_id`` matches the stored one (a Celery autoretry
    of the same task). On a fresh attempt — including any caller that
    doesn't pass an ``attempt_id``, e.g. legacy code or tests — the
    row's checkpoint is reset so the loop starts from chunk 0. This
    is what prevents a completed checkpoint from any prior run
    silently no-op'ing the next sync/reingest.

    Best-effort: a DB outage falls back to ``0`` (fresh run from
    chunk 0). The embed loop's own re-raise still ensures partial
    runs don't get cached as complete.
    """
    try:
        with db_session() as conn:
            progress = IngestChunkProgressRepository(conn).init_progress(
                source_id, total_chunks, attempt_id,
            )
    except Exception as e:
        logging.warning(
            f"Could not init ingest progress for {source_id}: {e}",
            exc_info=True,
        )
        return 0
    if not progress:
        return 0
    last_index = progress.get("last_index", -1)
    if last_index is None or last_index < 0:
        return 0
    return int(last_index) + 1


def _record_progress(source_id: str, last_index: int, embedded_chunks: int) -> None:
    """Best-effort checkpoint after each chunk; logged but never raised."""
    try:
        with db_session() as conn:
            IngestChunkProgressRepository(conn).record_chunk(
                source_id, last_index=last_index, embedded_chunks=embedded_chunks
            )
    except Exception as e:
        logging.warning(
            f"Could not record ingest progress for {source_id}: {e}", exc_info=True
        )


def assert_index_complete(source_id: str) -> None:
    """Raise ``EmbeddingPipelineError`` if ``ingest_chunk_progress``
    shows a partial embed for ``source_id``.

    Defense-in-depth tripwire that workers run after
    ``embed_and_store_documents`` to catch any future swallow path
    that bypasses the function's own re-raise — the chunk-progress
    row is the authoritative record of how many chunks landed.
    No-op when no row exists (zero-doc validation raised before init,
    or progress repo was unreachable).
    """
    try:
        with db_session() as conn:
            progress = IngestChunkProgressRepository(conn).get_progress(source_id)
    except Exception as e:
        logging.warning(
            f"assert_index_complete: progress lookup failed for "
            f"{source_id}: {e}",
            exc_info=True,
        )
        return
    if not progress:
        return
    embedded = int(progress.get("embedded_chunks") or 0)
    total = int(progress.get("total_chunks") or 0)
    if embedded < total:
        raise EmbeddingPipelineError(
            f"partial index for source {source_id}: "
            f"{embedded}/{total} chunks embedded"
        )


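How a task body might compose the pieces above (hypothetical wiring; the app
instance, task name and decorator arguments are assumptions, only the two
functions come from this file):

    @celery_app.task(bind=True, autoretry_for=(EmbeddingPipelineError,), max_retries=5)
    def ingest_source(self, docs, folder_name, source_id):
        embed_and_store_documents(
            docs, folder_name, source_id,
            task_status=self,             # stand-in for the real status manager
            attempt_id=self.request.id,   # stable across autoretries -> resume
        )
        assert_index_complete(source_id)  # tripwire: a partial index raises -> retry
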
def embed_and_store_documents(
    docs: List[Any],
    folder_name: str,
    source_id: str,
    task_status: Any,
    *,
    attempt_id: Optional[str] = None,
) -> None:
    """Embeds documents and stores them in a vector store.

    Resumable across Celery autoretries of the *same* task: when
    ``attempt_id`` matches the stored checkpoint's ``attempt_id``,
    the loop resumes from ``last_index + 1``. A different
    ``attempt_id`` (a fresh sync / reingest invocation) resets the
    checkpoint so the index is rebuilt from chunk 0 — this is what
    keeps a completed checkpoint from poisoning the next sync.

    Args:
        docs: List of documents to be embedded and stored.
        folder_name: Directory to save the vector store.
        source_id: Unique identifier for the source.
        task_status: Task state manager for progress updates.
        attempt_id: Stable id of the current task invocation,
            typically ``self.request.id`` from the Celery task body.
            ``None`` is treated as a fresh attempt every time.

    Returns:
        None

    Raises:
        OSError: If unable to create folder or save vector store.
        Exception: If vector store creation or document embedding fails.
        EmbeddingPipelineError: If a chunk fails after retries.
    """
    # Ensure the folder exists
    if not os.path.exists(folder_name):
@@ -69,33 +186,77 @@ def embed_and_store_documents(docs: List[Any], folder_name: str, source_id: str,
    if not docs:
        raise ValueError("No documents to embed - check file format and extension")

    total_docs = len(docs)
    # Atomic upsert that preserves checkpoint state on attempt-id match
    # (autoretry of same task) and resets it on mismatch (fresh sync /
    # reingest). Returns the new resume index — 0 means "start fresh".
    resume_index = _init_progress_and_resume_index(
        source_id, total_docs, attempt_id,
    )
    is_resume = resume_index > 0

    # Initialize vector store
    if settings.VECTOR_STORE == "faiss":
        docs_init = [docs.pop(0)]
        store = VectorCreator.create_vectorstore(
            settings.VECTOR_STORE,
            docs_init=docs_init,
            source_id=source_id,
            embeddings_key=os.getenv("EMBEDDINGS_KEY"),
        )
        if is_resume:
            # Load the existing FAISS index from storage so chunks
            # already embedded by the prior attempt survive the
            # save_local rewrite at the end of this run.
            store = VectorCreator.create_vectorstore(
                settings.VECTOR_STORE,
                source_id=source_id,
                embeddings_key=os.getenv("EMBEDDINGS_KEY"),
            )
            loop_start = resume_index
        else:
            # FAISS requires at least one doc to construct the store;
            # seed with ``docs[0]`` and let the loop pick up at index 1.
            store = VectorCreator.create_vectorstore(
                settings.VECTOR_STORE,
                docs_init=[docs[0]],
                source_id=source_id,
                embeddings_key=os.getenv("EMBEDDINGS_KEY"),
            )
            # Record the seeded chunk so single-doc ingests don't fail
            # ``assert_index_complete`` — the loop never runs for
            # ``total_docs == 1`` and would otherwise leave
            # ``embedded_chunks`` at 0 / ``last_index`` at -1. The loop
            # body's per-iteration ``_record_progress`` overshoots
            # correctly for multi-chunk runs (counts seed + iterations),
            # so writing this checkpoint up-front is a no-op for those.
            _record_progress(source_id, last_index=0, embedded_chunks=1)
            loop_start = 1
    else:
        store = VectorCreator.create_vectorstore(
            settings.VECTOR_STORE,
            source_id=source_id,
            embeddings_key=os.getenv("EMBEDDINGS_KEY"),
        )
        store.delete_index()
        # Only wipe the index on a fresh run — a resume must keep the
        # chunks that earlier attempts already embedded.
        if not is_resume:
            store.delete_index()
        loop_start = resume_index

    total_docs = len(docs)
    if is_resume and loop_start >= total_docs:
        # Nothing left to do; the loop runs zero iterations and
        # downstream finalize logic still executes. This is only
        # reachable on a same-attempt retry of a task whose previous
        # attempt finished — typically a Celery acks_late redelivery
        # after the task already returned. The ``assert_index_complete``
        # tripwire still validates ``embedded == total`` afterwards.
        loop_start = total_docs

    # Process and embed documents
    for idx, doc in tqdm(
        enumerate(docs),
    chunk_error: Exception | None = None
    failed_idx: int | None = None
    for idx in tqdm(
        range(loop_start, total_docs),
        desc="Embedding 🦖",
        unit="docs",
        total=total_docs,
        total=total_docs - loop_start,
        bar_format="{l_bar}{bar}| Time Left: {remaining}",
    ):
        doc = docs[idx]
        try:
            # Update task status for progress tracking
            progress = int(((idx + 1) / total_docs) * 100)
@@ -103,7 +264,10 @@ def embed_and_store_documents(docs: List[Any], folder_name: str, source_id: str,

            # Add document to vector store
            add_text_to_store_with_retry(store, doc, source_id)
            _record_progress(source_id, last_index=idx, embedded_chunks=idx + 1)
        except Exception as e:
            chunk_error = e
            failed_idx = idx
            logging.error(f"Error embedding document {idx}: {e}", exc_info=True)
            logging.info(f"Saving progress at document {idx} out of {total_docs}")
            try:
@@ -124,3 +288,16 @@ def embed_and_store_documents(docs: List[Any], folder_name: str, source_id: str,
        raise OSError(f"Unable to save vector store to {folder_name}: {e}") from e
    else:
        logging.info("Vector store saved successfully.")

    # Re-raise after the partial save: the chunks that *did* embed are
    # flushed to disk and recorded in ``ingest_chunk_progress``, so a
    # Celery autoretry resumes via ``_init_progress_and_resume_index``
    # and only re-runs the failed-and-after chunks. Without the raise,
    # the task body returns success and ``with_idempotency`` finalises
    # ``task_dedup`` as ``completed`` for a partial index — poisoning
    # the cache for 24h.
    if chunk_error is not None:
        raise EmbeddingPipelineError(
            f"embed failure at chunk {failed_idx}/{total_docs} "
            f"for source {source_id}"
        ) from chunk_error

@@ -60,6 +60,9 @@ class ClassicRAG(BaseRetriever):
            agent_id=self.agent_id,
            model_user_id=self.model_user_id,
        )
        # Query-rephrase LLM is a side channel — tag it so its rows
        # land as ``source='rag_condense'`` in cost-attribution.
        self.llm._token_usage_source = "rag_condense"

        if "active_docs" in source and source["active_docs"] is not None:
            if isinstance(source["active_docs"], list):

@@ -11,6 +11,8 @@ import re
from typing import Any, Mapping
from uuid import UUID

from application.storage.db.serialization import coerce_pg_native


_UUID_RE = re.compile(
    r"^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$",
@@ -34,12 +36,17 @@ def looks_like_uuid(value: Any) -> bool:


def row_to_dict(row: Any) -> dict:
    """Convert a SQLAlchemy ``Row`` to a plain dict with Mongo-compatible ids.
    """Convert a SQLAlchemy ``Row`` to a plain JSON-safe dict.

    During the migration window, API responses and downstream code still
    expect a string ``_id`` field (matching the Mongo shape). This helper
    normalizes UUID columns to strings and emits both ``id`` and ``_id`` so
    existing serializers keep working unchanged.
    Normalises PG-native types at the SELECT boundary: UUID, datetime,
    date, Decimal, and bytes are coerced to JSON-safe forms via
    :func:`coerce_pg_native`. Downstream serialisation (SSE events,
    JSONB writes, API responses) becomes safe by default — repository
    consumers no longer need to know that PG returns a different type
    set than Mongo did.

    Also emits ``_id`` alongside ``id`` for the duration of the Mongo→PG
    cutover so legacy serializers expecting Mongo's shape keep working.

    Args:
        row: A SQLAlchemy ``Row`` object, or ``None``.
@@ -52,10 +59,9 @@ def row_to_dict(row: Any) -> dict:

    # Row has a ``._mapping`` attribute exposing a MappingProxy view.
    mapping: Mapping[str, Any] = row._mapping  # type: ignore[attr-defined]
    out = dict(mapping)
    out = coerce_pg_native(dict(mapping))

    if "id" in out and out["id"] is not None:
        out["id"] = str(out["id"]) if isinstance(out["id"], UUID) else out["id"]
        out["_id"] = out["id"]

    return out

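Illustrative effect on one fetched row (assumes ``coerce_pg_native``
stringifies UUID/Decimal values and ISO-formats datetimes; bytes become
base64):

    # in : {"id": UUID("2b4c..."), "date": datetime(2025, 1, 1), "fee": Decimal("0.25")}
    # out: {"id": "2b4c...", "_id": "2b4c...", "date": "2025-01-01T00:00:00", "fee": "0.25"}
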
@@ -91,6 +91,16 @@ token_usage_table = Table(
    Column("prompt_tokens", Integer, nullable=False, server_default="0"),
    Column("generated_tokens", Integer, nullable=False, server_default="0"),
    Column("timestamp", DateTime(timezone=True), nullable=False, server_default=func.now()),
    # Added in ``0004_durability_foundation``. Distinguishes
    # ``agent_stream`` (primary completion) from side-channel inserts
    # (``title`` / ``compression`` / ``rag_condense`` / ``fallback``)
    # so cost attribution dashboards can group by call source.
    Column("source", Text, nullable=False, server_default="agent_stream"),
    # Added in ``0005_token_usage_request_id``. Stream-scoped UUID stamped
    # on the agent's primary LLM so multi-call agent runs (which produce
    # N rows) count as a single request via DISTINCT in the repository
    # query. NULL on side-channel sources by design.
    Column("request_id", Text),
)

user_logs_table = Table(
@@ -345,6 +355,11 @@ conversation_messages_table = Table(
    Column("feedback", JSONB),
    Column("timestamp", DateTime(timezone=True), nullable=False, server_default=func.now()),
    Column("updated_at", DateTime(timezone=True), nullable=False, server_default=func.now()),
    # Added in ``0004_durability_foundation``. ``status`` is the WAL state
    # machine (pending|streaming|complete|failed); ``request_id`` ties a
    # row to a specific HTTP request for log correlation.
    Column("status", Text, nullable=False, server_default="complete"),
    Column("request_id", Text),
    UniqueConstraint("conversation_id", "position", name="conversation_messages_conv_pos_uidx"),
)

@@ -377,9 +392,101 @@ pending_tool_state_table = Table(
    Column("client_tools", JSONB),
    Column("created_at", DateTime(timezone=True), nullable=False, server_default=func.now()),
    Column("expires_at", DateTime(timezone=True), nullable=False),
    # Added in ``0004_durability_foundation``. ``status`` is the
    # ``pending|resuming`` claim flag for the resumed-run path;
    # ``resumed_at`` stamps when ``mark_resuming`` flipped the row so
    # the cleanup janitor can revert stale claims after the grace
    # window.
    Column("status", Text, nullable=False, server_default="pending"),
    Column("resumed_at", DateTime(timezone=True)),
    UniqueConstraint("conversation_id", "user_id", name="pending_tool_state_conv_user_uidx"),
)


# --- Tier 1 durability foundation (migration 0004) --------------------------
# CHECK constraints (status enums) and partial indexes are intentionally
# omitted from these declarations — the DB is the authority. Repositories
# use raw ``text(...)`` SQL against these tables, not the Core objects.

task_dedup_table = Table(
    "task_dedup",
    metadata,
    Column("idempotency_key", Text, primary_key=True),
    Column("task_name", Text, nullable=False),
    Column("task_id", Text, nullable=False),
    Column("result_json", JSONB),
    # CHECK (status IN ('pending', 'completed', 'failed')) lives in 0004.
    Column("status", Text, nullable=False),
    # Bumped each time the per-Celery-task wrapper re-enters; the
    # poison-loop guard (``MAX_TASK_ATTEMPTS=5``) refuses to run fn once
    # this exceeds the threshold.
    Column("attempt_count", Integer, nullable=False, server_default="0"),
    Column("created_at", DateTime(timezone=True), nullable=False, server_default=func.now()),
    # Added in ``0006_idempotency_lease``. Per-invocation random id
    # written by the wrapper at lease claim; refreshed every 30 s by a
    # heartbeat thread. Other workers seeing a fresh lease (NOT NULL
    # AND ``lease_expires_at > now()``) refuse to run the task body.
    Column("lease_owner_id", Text),
    Column("lease_expires_at", DateTime(timezone=True)),
)

webhook_dedup_table = Table(
    "webhook_dedup",
    metadata,
    Column("idempotency_key", Text, primary_key=True),
    Column("agent_id", UUID(as_uuid=True), nullable=False),
    Column("task_id", Text, nullable=False),
    Column("response_json", JSONB),
    Column("created_at", DateTime(timezone=True), nullable=False, server_default=func.now()),
)

# Three-phase tool-call journal: ``proposed → executed → confirmed``
# (terminal: ``failed``; ``compensated`` is grandfathered in the CHECK
# from migration 0004 but no code writes it). The reconciler sweeps
# stuck rows via the partial ``tool_call_attempts_pending_ts_idx``.
tool_call_attempts_table = Table(
    "tool_call_attempts",
    metadata,
    Column("call_id", Text, primary_key=True),
    # ON DELETE SET NULL preserves the journal even after the parent
    # message is deleted — useful for cost-attribution / compliance.
    Column(
        "message_id",
        UUID(as_uuid=True),
        ForeignKey("conversation_messages.id", ondelete="SET NULL"),
    ),
    Column("tool_id", UUID(as_uuid=True)),
    Column("tool_name", Text, nullable=False),
    Column("action_name", Text, nullable=False),
    Column("arguments", JSONB, nullable=False),
    Column("result", JSONB),
    Column("error", Text),
    # CHECK (status IN ('proposed', 'executed', 'confirmed',
    # 'compensated', 'failed')) lives in 0004.
    Column("status", Text, nullable=False),
    Column("attempted_at", DateTime(timezone=True), nullable=False, server_default=func.now()),
    Column("updated_at", DateTime(timezone=True), nullable=False, server_default=func.now()),
)

# Per-source ingest checkpoint. Heartbeat thread bumps ``last_updated``
# every 30s while a worker embeds; the reconciler escalates when it
# stops ticking.
ingest_chunk_progress_table = Table(
    "ingest_chunk_progress",
    metadata,
    Column("source_id", UUID(as_uuid=True), primary_key=True),
    Column("total_chunks", Integer, nullable=False),
    Column("embedded_chunks", Integer, nullable=False, server_default="0"),
    Column("last_index", Integer, nullable=False, server_default="-1"),
    Column("last_updated", DateTime(timezone=True), nullable=False, server_default=func.now()),
    # Added in ``0005_ingest_attempt_id``. Stamped from
    # ``self.request.id`` (Celery's stable task id) so a retry of the
    # same task resumes from the checkpoint, but a separate invocation
    # (manual reingest, scheduled sync) resets to a clean re-index.
    Column("attempt_id", Text),
)


workflows_table = Table(
    "workflows",
    metadata,

@@ -25,6 +25,7 @@ from typing import Any, Optional
from sqlalchemy import Connection, text

from application.storage.db.base_repository import row_to_dict
from application.storage.db.serialization import PGNativeJSONEncoder


_UPDATABLE_SCALARS = {
@@ -36,7 +37,7 @@ _UPDATABLE_JSONB = {"session_data", "token_info"}
def _jsonb(value: Any) -> Any:
    if value is None:
        return None
    return json.dumps(value, default=str)
    return json.dumps(value, cls=PGNativeJSONEncoder)


class ConnectorSessionsRepository:

@@ -22,6 +22,7 @@ from sqlalchemy.dialects.postgresql import insert as pg_insert

from application.storage.db.base_repository import looks_like_uuid, row_to_dict
from application.storage.db.models import conversations_table, conversation_messages_table
from application.storage.db.serialization import PGNativeJSONEncoder


def _message_row_to_dict(row) -> dict:
@@ -244,6 +245,57 @@ class ConversationsRepository:
        )
        return [row_to_dict(r) for r in result.fetchall()]

    def search_for_user(
        self, user_id: str, query: str, limit: int = 30,
    ) -> list[dict]:
        """Search a user's conversations by name or message content.

        Same visibility filter as :meth:`list_for_user`. Matches against
        ``conversations.name`` or any of the conversation's messages'
        ``prompt`` / ``response`` columns (case-insensitive substring).

        Each returned row includes ``match_field`` (one of ``name``,
        ``prompt``, ``response``) and ``match_text`` (the full text of the
        first matching field, ``name`` taking precedence over messages,
        ``prompt`` over ``response``) so callers can render a snippet.
        """
        if not query:
            return []
        result = self._conn.execute(
            text(
                "SELECT c.*, mt.match_field, mt.match_text "
                "FROM conversations c "
                "JOIN LATERAL ( "
                "  SELECT field AS match_field, txt AS match_text "
                "  FROM ( "
                "    SELECT 'name'::text AS field, c.name AS txt, 0 AS prio "
                "    WHERE c.name ILIKE :pattern "
                "    UNION ALL "
                "    SELECT 'prompt'::text, m.prompt, 1 "
                "    FROM conversation_messages m "
                "    WHERE m.conversation_id = c.id "
                "      AND m.prompt ILIKE :pattern "
                "    UNION ALL "
                "    SELECT 'response'::text, m.response, 2 "
                "    FROM conversation_messages m "
                "    WHERE m.conversation_id = c.id "
                "      AND m.response ILIKE :pattern "
                "  ) s "
                "  ORDER BY prio "
                "  LIMIT 1 "
                ") mt ON TRUE "
                "WHERE c.user_id = :user_id "
                "AND (c.api_key IS NULL OR c.agent_id IS NOT NULL) "
                "ORDER BY c.date DESC LIMIT :limit"
            ),
            {
                "user_id": user_id,
                "pattern": f"%{query}%",
                "limit": limit,
            },
        )
        return [row_to_dict(r) for r in result.fetchall()]

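A hypothetical caller (the surrounding route is an assumption; the repository
class and ``db_session`` helper are from this branch):

    with db_session() as conn:
        hits = ConversationsRepository(conn).search_for_user(
            user_id, "invoice", limit=10,
        )
    for hit in hits:
        print(hit["name"], hit["match_field"], hit["match_text"][:80])
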
    def rename(self, conversation_id: str, user_id: str, name: str) -> bool:
        # Shape-gate so a non-UUID id (legacy Mongo ObjectId still floating
        # around in client-side state during the cutover) never reaches the
@@ -452,7 +504,7 @@ class ConversationsRepository:
            ),
            {
                "id": conversation_id,
                "point": json.dumps(point, default=str),
                "point": json.dumps(point, cls=PGNativeJSONEncoder),
                "max_points": int(max_points),
            },
        )
@@ -632,6 +684,200 @@ class ConversationsRepository:
        result = self._conn.execute(text(sql), params)
        return result.rowcount > 0

    def reserve_message(
        self,
        conversation_id: str,
        *,
        prompt: str,
        placeholder_response: str,
        request_id: str | None = None,
        status: str = "pending",
        attachments: list[str] | None = None,
        model_id: str | None = None,
        metadata: dict | None = None,
    ) -> dict:
        """Pre-persist a placeholder assistant message before the LLM call."""
        self._conn.execute(
            text(
                "SELECT id FROM conversations "
                "WHERE id = CAST(:conv_id AS uuid) FOR UPDATE"
            ),
            {"conv_id": conversation_id},
        )
        next_pos = self._conn.execute(
            text(
                "SELECT COALESCE(MAX(position), -1) + 1 AS next_pos "
                "FROM conversation_messages "
                "WHERE conversation_id = CAST(:conv_id AS uuid)"
            ),
            {"conv_id": conversation_id},
        ).scalar()

        values = {
            "conversation_id": conversation_id,
            "position": next_pos,
            "prompt": prompt,
            "response": placeholder_response,
            "status": status,
            "request_id": request_id,
            "model_id": model_id,
            "message_metadata": metadata or {},
        }
        if attachments:
            resolved = self._resolve_attachment_refs(
                [str(a) for a in attachments],
            )
            if resolved:
                values["attachments"] = resolved

        stmt = (
            pg_insert(conversation_messages_table)
            .values(**values)
            .returning(conversation_messages_table)
        )
        result = self._conn.execute(stmt)
        self._conn.execute(
            text(
                "UPDATE conversations SET updated_at = now() "
                "WHERE id = CAST(:id AS uuid)"
            ),
            {"id": conversation_id},
        )
        return _message_row_to_dict(result.fetchone())

    def update_message_by_id(
        self, message_id: str, fields: dict,
        *, only_if_non_terminal: bool = False,
    ) -> bool:
        """Update specific fields on a message identified by its UUID.

        ``metadata`` is merged into the existing JSONB rather than
        overwritten, so a reconciler-set ``reconcile_attempts`` survives
        a successful late finalize. When ``only_if_non_terminal`` is
        True, the update is gated so a late finalize cannot retract a
        reconciler-set ``failed`` (or a prior ``complete``).
        """
        if not looks_like_uuid(message_id):
            return False
        allowed = {
            "prompt", "response", "thought", "sources", "tool_calls",
            "attachments", "model_id", "metadata", "timestamp", "status",
            "request_id", "feedback", "feedback_timestamp",
        }
        filtered = {k: v for k, v in fields.items() if k in allowed}
        if not filtered:
            return False

        api_to_col = {"metadata": "message_metadata"}

        set_parts = []
        params: dict = {"id": message_id}
        for key, val in filtered.items():
            col = api_to_col.get(key, key)
            if key == "metadata":
                if val is None:
                    set_parts.append(f"{col} = NULL")
                else:
                    set_parts.append(
                        f"{col} = COALESCE({col}, '{{}}'::jsonb) "
                        f"|| CAST(:{col} AS jsonb)"
                    )
                    params[col] = (
                        json.dumps(val) if not isinstance(val, str) else val
                    )
            elif key in ("sources", "tool_calls", "feedback"):
                set_parts.append(f"{col} = CAST(:{col} AS jsonb)")
                if val is None:
                    params[col] = None
                else:
                    params[col] = (
                        json.dumps(val) if not isinstance(val, str) else val
                    )
            elif key == "attachments":
                set_parts.append(f"{col} = CAST(:{col} AS uuid[])")
                params[col] = self._resolve_attachment_refs(
                    [str(a) for a in val] if val else [],
                )
            else:
                set_parts.append(f"{col} = :{col}")
                params[col] = val

        set_parts.append("updated_at = now()")
        where_clauses = ["id = CAST(:id AS uuid)"]
        if only_if_non_terminal:
            where_clauses.append("status NOT IN ('complete', 'failed')")
        sql = (
            f"UPDATE conversation_messages SET {', '.join(set_parts)} "
            f"WHERE {' AND '.join(where_clauses)}"
        )
        result = self._conn.execute(text(sql), params)
        return result.rowcount > 0

    def update_message_status(
        self, message_id: str, status: str,
    ) -> bool:
        """Cheap status-only transition (e.g. pending → streaming).

        Only flips non-terminal rows: a reconciler-set ``failed`` row
        stays put so the late streaming chunk doesn't silently retract
        the alert.
        """
        if not looks_like_uuid(message_id):
            return False
        result = self._conn.execute(
            text(
                "UPDATE conversation_messages SET status = :status, "
                "updated_at = now() "
                "WHERE id = CAST(:id AS uuid) "
                "AND status NOT IN ('complete', 'failed')"
            ),
            {"id": message_id, "status": status},
        )
        return result.rowcount > 0

    def heartbeat_message(self, message_id: str) -> bool:
        """Stamp ``message_metadata.last_heartbeat_at`` with ``clock_timestamp()``.

        The reconciler's staleness check uses ``GREATEST(timestamp,
        last_heartbeat_at)``, so this call extends a long-running
        stream's effective freshness without touching ``timestamp`` (the
        creation time, used for history sort) or ``status`` (the WAL
        marker). Skips terminal rows so a late heartbeat can't silently
        retract a reconciler-set ``failed``.
        """
        if not looks_like_uuid(message_id):
            return False
        result = self._conn.execute(
            text(
                """
                UPDATE conversation_messages
                SET message_metadata = jsonb_set(
                    COALESCE(message_metadata, '{}'::jsonb),
                    '{last_heartbeat_at}',
                    to_jsonb(clock_timestamp())
                )
                WHERE id = CAST(:id AS uuid)
                  AND status NOT IN ('complete', 'failed')
                """
            ),
            {"id": message_id},
        )
        return result.rowcount > 0

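The staleness predicate the reconciler is described as using (assumed shape;
the reconciler itself is not part of this diff):

    # stale iff the row is non-terminal and neither its creation time nor
    # its last heartbeat is recent:
    #   GREATEST(timestamp,
    #            (message_metadata->>'last_heartbeat_at')::timestamptz)
    #       < now() - <grace interval>
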
    def confirm_executed_tool_calls(self, message_id: str) -> int:
        """Flip ``tool_call_attempts.status='executed' → 'confirmed'`` for the message."""
        if not looks_like_uuid(message_id):
            return 0
        result = self._conn.execute(
            text(
                "UPDATE tool_call_attempts SET status = 'confirmed', "
                "updated_at = now() "
                "WHERE message_id = CAST(:mid AS uuid) AND status = 'executed'"
            ),
            {"mid": message_id},
        )
        return result.rowcount or 0

    def truncate_after(self, conversation_id: str, keep_up_to: int) -> int:
        """Delete messages with position > keep_up_to.

346
application/storage/db/repositories/idempotency.py
Normal file
@@ -0,0 +1,346 @@
"""Repository for ``webhook_dedup`` and ``task_dedup``; 24h TTL enforced at read."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from typing import Any, Optional
|
||||
|
||||
from sqlalchemy import Connection, text
|
||||
|
||||
from application.storage.db.base_repository import row_to_dict
|
||||
from application.storage.db.serialization import PGNativeJSONEncoder
|
||||
|
||||
# 24h TTL is the contract surfaced in the upload/webhook docstrings; the
|
||||
# read filters and the stale-row replacement predicate must agree, or the
|
||||
# upsert can fall into a window where the row is "fresh" to the writer
|
||||
# but "expired" to the reader (or vice versa). Keep one constant so any
|
||||
# future change moves both directions in lockstep.
|
||||
DEDUP_TTL_INTERVAL = "24 hours"
|
||||
|
||||
|
||||
def _jsonb(value: Any) -> Any:
|
||||
if value is None:
|
||||
return None
|
||||
return json.dumps(value, cls=PGNativeJSONEncoder)
|
||||
|
||||
|
||||
class IdempotencyRepository:
|
||||
def __init__(self, conn: Connection) -> None:
|
||||
self._conn = conn
|
||||
|
||||
# --- webhook_dedup -----------------------------------------------------
|
||||
|
||||
    def get_webhook(self, key: str) -> Optional[dict]:
        """Return the cached webhook row for ``key`` if still within the 24h window."""
        row = self._conn.execute(
            text(
                """
                SELECT * FROM webhook_dedup
                WHERE idempotency_key = :key
                  AND created_at > now() - CAST(:ttl AS interval)
                """
            ),
            {"key": key, "ttl": DEDUP_TTL_INTERVAL},
        ).fetchone()
        return row_to_dict(row) if row is not None else None

    def record_webhook(
        self,
        key: str,
        agent_id: str,
        task_id: str,
        response_json: dict,
    ) -> Optional[dict]:
        """Insert a webhook dedup row; return None if another writer raced and won.

        ``ON CONFLICT`` replaces an existing row only when its ``created_at``
        is past TTL — atomic stale-row recycling under the row lock. A
        within-TTL conflict yields no row; the caller resolves it via
        :meth:`get_webhook`.
        """
        result = self._conn.execute(
            text(
                """
                INSERT INTO webhook_dedup (
                    idempotency_key, agent_id, task_id, response_json
                )
                VALUES (
                    :key, CAST(:agent_id AS uuid), :task_id,
                    CAST(:response_json AS jsonb)
                )
                ON CONFLICT (idempotency_key) DO UPDATE
                SET agent_id = EXCLUDED.agent_id,
                    task_id = EXCLUDED.task_id,
                    response_json = EXCLUDED.response_json,
                    created_at = now()
                WHERE webhook_dedup.created_at
                      <= now() - CAST(:ttl AS interval)
                RETURNING *
                """
            ),
            {
                "key": key,
                "agent_id": agent_id,
                "task_id": task_id,
                "response_json": _jsonb(response_json),
                "ttl": DEDUP_TTL_INTERVAL,
            },
        )
        row = result.fetchone()
        return row_to_dict(row) if row is not None else None

    # --- task_dedup --------------------------------------------------------

    def get_task(self, key: str) -> Optional[dict]:
        """Return the cached task row for ``key`` if still within the 24h window."""
        row = self._conn.execute(
            text(
                """
                SELECT * FROM task_dedup
                WHERE idempotency_key = :key
                  AND created_at > now() - CAST(:ttl AS interval)
                """
            ),
            {"key": key, "ttl": DEDUP_TTL_INTERVAL},
        ).fetchone()
        return row_to_dict(row) if row is not None else None

    def claim_task(
        self,
        key: str,
        task_name: str,
        task_id: str,
    ) -> Optional[dict]:
        """Claim ``key`` for this task. Returns the inserted row, or None if
        another writer raced and won. The HTTP entry must call this *before*
        ``.delay()`` so only the winner enqueues the Celery task.

        ``ON CONFLICT`` replaces an existing row in two cases:

        - **status='failed'**: the worker's poison-loop guard or the
          reconciler's stuck-pending sweep finalised the prior attempt
          as failed. Both explicitly intend a same-key retry to re-run
          (see ``run_reconciliation`` Q5 docstring) — letting the row
          block for 24 h would silently undo that intent.
        - **created_at past TTL**: a stale claim from any status no
          longer represents a meaningful dedup signal.

        ``status='completed'`` rows still block within TTL — that's the
        cached-success contract callers rely on. ``status='pending'``
        rows still block within TTL so concurrent same-key requests
        collapse onto the in-flight task. Result/attempt fields are
        reset to their fresh-claim defaults during replacement.
        """
        result = self._conn.execute(
            text(
                """
                INSERT INTO task_dedup (
                    idempotency_key, task_name, task_id, result_json, status
                )
                VALUES (
                    :key, :task_name, :task_id, NULL, 'pending'
                )
                ON CONFLICT (idempotency_key) DO UPDATE
                SET task_name = EXCLUDED.task_name,
                    task_id = EXCLUDED.task_id,
                    result_json = NULL,
                    status = 'pending',
                    attempt_count = 0,
                    created_at = now()
                WHERE task_dedup.status = 'failed'
                   OR task_dedup.created_at
                      <= now() - CAST(:ttl AS interval)
                RETURNING *
                """
            ),
            {
                "key": key,
                "task_name": task_name,
                "task_id": task_id,
                "ttl": DEDUP_TTL_INTERVAL,
            },
        )
        row = result.fetchone()
        return row_to_dict(row) if row is not None else None

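A hypothetical HTTP-entry sketch of the claim-before-enqueue contract (route
and task names are assumptions; the repository calls are this class's API):

    with db_session() as conn:
        claimed = IdempotencyRepository(conn).claim_task(
            key, task_name="ingest_source", task_id=new_task_id,
        )
    if claimed is not None:
        ingest_source.delay(source_id)   # only the claim winner enqueues
    else:
        with db_session() as conn:
            cached = IdempotencyRepository(conn).get_task(key)  # in-flight or cached outcome
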
    def try_claim_lease(
        self,
        key: str,
        task_name: str,
        task_id: str,
        owner_id: str,
        ttl_seconds: int = 60,
    ) -> Optional[int]:
        """Atomically claim the running lease for ``key``.

        Returns the new ``attempt_count`` if this caller now owns the
        lease (fresh insert OR existing row whose lease was empty/expired),
        or ``None`` if a different worker holds a live lease.

        The conflict path also bumps ``attempt_count`` so the
        poison-loop guard in :func:`with_idempotency` can fire after
        :data:`MAX_TASK_ATTEMPTS` reclaims. ``status='completed'`` rows
        are deliberately untouched — :func:`_lookup_completed` is the
        cache short-circuit and runs before this. Uses
        ``clock_timestamp()`` so a same-transaction refresh actually
        moves the expiry forward (``now()`` is frozen at txn start).
        """
        result = self._conn.execute(
            text(
                """
                INSERT INTO task_dedup (
                    idempotency_key, task_name, task_id, status, attempt_count,
                    lease_owner_id, lease_expires_at
                ) VALUES (
                    :key, :task_name, :task_id, 'pending', 1,
                    :owner,
                    clock_timestamp() + make_interval(secs => :ttl)
                )
                ON CONFLICT (idempotency_key) DO UPDATE
                SET attempt_count = task_dedup.attempt_count + 1,
                    task_name = EXCLUDED.task_name,
                    lease_owner_id = EXCLUDED.lease_owner_id,
                    lease_expires_at = EXCLUDED.lease_expires_at
                WHERE task_dedup.status <> 'completed'
                  AND (task_dedup.lease_expires_at IS NULL
                       OR task_dedup.lease_expires_at <= clock_timestamp())
                RETURNING attempt_count
                """
            ),
            {
                "key": key,
                "task_name": task_name,
                "task_id": task_id,
                "owner": owner_id,
                "ttl": int(ttl_seconds),
            },
        )
        row = result.fetchone()
        return int(row[0]) if row is not None else None

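A worker-side sketch of the lease protocol (function shape, owner id and
intervals are assumptions; the method names are this class's API):

    def run_exclusively(conn, key, task_name, task_id, body):
        owner = uuid.uuid4().hex
        attempt = IdempotencyRepository(conn).try_claim_lease(
            key, task_name, task_id, owner, ttl_seconds=60,
        )
        if attempt is None:
            return None  # a live lease exists on another worker
        # A heartbeat thread would call refresh_lease(key, owner) roughly
        # every 30 s while body() runs; release_lease(key, owner) on the
        # exception path frees the key early, and finalize_task(key, ...)
        # records the outcome on completion.
        return body()
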
    def refresh_lease(
        self,
        key: str,
        owner_id: str,
        ttl_seconds: int = 60,
    ) -> bool:
        """Bump ``lease_expires_at`` if this caller still owns the lease.

        Returns False when ownership was lost (lease stolen by another
        worker after expiry, or row finalised). The heartbeat thread
        logs that as a warning but doesn't try to abort the running
        task — at-most-one-worker is only bounded by ``ttl_seconds``,
        so the damage from a brief overlap window is unavoidable in
        this case.
        """
        result = self._conn.execute(
            text(
                """
                UPDATE task_dedup
                SET lease_expires_at =
                    clock_timestamp() + make_interval(secs => :ttl)
                WHERE idempotency_key = :key
                  AND lease_owner_id = :owner
                  AND status = 'pending'
                """
            ),
            {
                "key": key,
                "owner": owner_id,
                "ttl": int(ttl_seconds),
            },
        )
        return result.rowcount > 0

    def release_lease(self, key: str, owner_id: str) -> bool:
        """Clear ``lease_owner_id`` / ``lease_expires_at`` on the
        wrapper's exception path so Celery's autoretry_for doesn't have
        to wait the full ``ttl_seconds`` before the next worker can
        re-claim. No-op if a different worker has since taken over the
        lease — that case is benign (we'd just be acknowledging we
        weren't the owner anymore).
        """
        result = self._conn.execute(
            text(
                """
                UPDATE task_dedup
                SET lease_owner_id = NULL,
                    lease_expires_at = NULL
                WHERE idempotency_key = :key
                  AND lease_owner_id = :owner
                  AND status = 'pending'
                """
            ),
            {"key": key, "owner": owner_id},
        )
        return result.rowcount > 0

    def finalize_task(
        self,
        key: str,
        *,
        result_json: Optional[dict],
        status: str,
    ) -> bool:
        """Promote ``status='pending'`` → ``completed|failed`` with the
        recorded result. Also clears the lease columns so a stale
        ``lease_expires_at`` doesn't show up in operator dashboards.
        No-op if the row is already terminal — preserves the first
        writer's outcome on a crash + retry.
        """
        if status not in ("completed", "failed"):
            raise ValueError(f"finalize_task: invalid status {status!r}")
        result = self._conn.execute(
            text(
                """
                UPDATE task_dedup
                SET status = :status,
                    result_json = CAST(:result_json AS jsonb),
                    lease_owner_id = NULL,
                    lease_expires_at = NULL
                WHERE idempotency_key = :key
                  AND status = 'pending'
                """
            ),
            {
                "key": key,
                "status": status,
                "result_json": _jsonb(result_json),
            },
        )
        return result.rowcount > 0

    # --- housekeeping ------------------------------------------------------

    def cleanup_expired(self) -> dict:
        """Delete rows past TTL from both dedup tables; return per-table counts.

        The TTL-aware upserts already prevent stale rows from blocking new
        work, so this is purely housekeeping — bounds table growth and
        keeps test isolation cheap. Safe to run concurrently with other
        writers: a same-key INSERT racing the DELETE will either find no
        row (acts as a fresh insert) or find a fresh row (re-created
        between DELETE and conflict-check), neither of which is wrong.
        """
        task_deleted = self._conn.execute(
            text(
                """
                DELETE FROM task_dedup
                WHERE created_at <= now() - CAST(:ttl AS interval)
                """
            ),
            {"ttl": DEDUP_TTL_INTERVAL},
        ).rowcount
        webhook_deleted = self._conn.execute(
            text(
                """
                DELETE FROM webhook_dedup
                WHERE created_at <= now() - CAST(:ttl AS interval)
                """
            ),
            {"ttl": DEDUP_TTL_INTERVAL},
        ).rowcount
        return {
            "task_dedup_deleted": int(task_deleted or 0),
            "webhook_dedup_deleted": int(webhook_deleted or 0),
        }

127
application/storage/db/repositories/ingest_chunk_progress.py
Normal file
@@ -0,0 +1,127 @@
"""Repository for ``ingest_chunk_progress``; per-source resume + heartbeat."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Optional
|
||||
|
||||
from sqlalchemy import Connection, text
|
||||
|
||||
from application.storage.db.base_repository import row_to_dict
|
||||
|
||||
|
||||
class IngestChunkProgressRepository:
|
||||
"""Read/write helpers for ``ingest_chunk_progress``."""
|
||||
|
||||
def __init__(self, conn: Connection) -> None:
|
||||
self._conn = conn
|
||||
|
||||
    def init_progress(
        self,
        source_id: str,
        total_chunks: int,
        attempt_id: Optional[str] = None,
    ) -> dict:
        """Upsert the progress row, scoped by ``attempt_id``.

        On conflict the upsert distinguishes two cases:

        - **Same attempt** (``attempt_id`` matches the stored value):
          this is a Celery autoretry of the same task — preserve
          ``last_index`` / ``embedded_chunks`` so the embed loop resumes
          from the checkpoint. Only ``total_chunks`` and
          ``last_updated`` get refreshed.
        - **Different attempt** (a fresh invocation: manual reingest,
          scheduled sync, or any caller that didn't pass an
          ``attempt_id``): reset ``last_index`` to ``-1`` and
          ``embedded_chunks`` to ``0`` so the loop starts from chunk 0.
          This prevents a completed checkpoint from any prior run
          poisoning the index.

        ``IS NOT DISTINCT FROM`` treats two NULLs as equal — so legacy
        rows with NULL ``attempt_id`` resume against another NULL
        caller (e.g. test fixtures), but get reset the moment a real
        ``attempt_id`` arrives.
        """
        result = self._conn.execute(
            text(
                """
                INSERT INTO ingest_chunk_progress (
                    source_id, total_chunks, embedded_chunks, last_index,
                    attempt_id, last_updated
                )
                VALUES (
                    CAST(:source_id AS uuid), :total_chunks, 0, -1,
                    :attempt_id, now()
                )
                ON CONFLICT (source_id) DO UPDATE SET
                    total_chunks = EXCLUDED.total_chunks,
                    last_updated = now(),
                    last_index = CASE
                        WHEN ingest_chunk_progress.attempt_id
                             IS NOT DISTINCT FROM EXCLUDED.attempt_id
                        THEN ingest_chunk_progress.last_index
                        ELSE -1
                    END,
                    embedded_chunks = CASE
                        WHEN ingest_chunk_progress.attempt_id
                             IS NOT DISTINCT FROM EXCLUDED.attempt_id
                        THEN ingest_chunk_progress.embedded_chunks
                        ELSE 0
                    END,
                    attempt_id = EXCLUDED.attempt_id
                RETURNING *
                """
            ),
            {
                "source_id": str(source_id),
                "total_chunks": int(total_chunks),
                "attempt_id": attempt_id,
            },
        )
        return row_to_dict(result.fetchone())

def record_chunk(
|
||||
self, source_id: str, last_index: int, embedded_chunks: int
|
||||
) -> None:
|
||||
"""Persist progress after a chunk is embedded."""
|
||||
self._conn.execute(
|
||||
text(
|
||||
"""
|
||||
UPDATE ingest_chunk_progress
|
||||
SET last_index = :last_index,
|
||||
embedded_chunks = :embedded_chunks,
|
||||
last_updated = now()
|
||||
WHERE source_id = CAST(:source_id AS uuid)
|
||||
"""
|
||||
),
|
||||
{
|
||||
"source_id": str(source_id),
|
||||
"last_index": int(last_index),
|
||||
"embedded_chunks": int(embedded_chunks),
|
||||
},
|
||||
)
|
||||
|
||||
def get_progress(self, source_id: str) -> Optional[dict]:
|
||||
"""Return the progress row for ``source_id`` if it exists."""
|
||||
result = self._conn.execute(
|
||||
text(
|
||||
"SELECT * FROM ingest_chunk_progress "
|
||||
"WHERE source_id = CAST(:source_id AS uuid)"
|
||||
),
|
||||
{"source_id": str(source_id)},
|
||||
)
|
||||
row = result.fetchone()
|
||||
return row_to_dict(row) if row is not None else None
|
||||
|
||||
def bump_heartbeat(self, source_id: str) -> None:
|
||||
"""Refresh ``last_updated`` so the row looks alive to the reconciler."""
|
||||
self._conn.execute(
|
||||
text(
|
||||
"""
|
||||
UPDATE ingest_chunk_progress
|
||||
SET last_updated = now()
|
||||
WHERE source_id = CAST(:source_id AS uuid)
|
||||
"""
|
||||
),
|
||||
{"source_id": str(source_id)},
|
||||
)
|
||||
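A minimal sketch of the resume contract these helpers support (the loop shape, chunk list, and embed_one step are illustrative stand-ins, not the shipped pipeline):

from application.storage.db.repositories.ingest_chunk_progress import (
    IngestChunkProgressRepository,
)
from application.storage.db.session import db_session

def embed_with_resume(source_id, chunks, attempt_id):
    # Same attempt_id keeps the checkpoint; a new attempt_id resets it.
    with db_session() as conn:
        progress = IngestChunkProgressRepository(conn).init_progress(
            source_id, total_chunks=len(chunks), attempt_id=attempt_id
        )
    start = progress["last_index"] + 1  # -1 on a fresh run, so start at 0
    for i in range(start, len(chunks)):
        embed_one(chunks[i])  # hypothetical embed step
        with db_session() as conn:
            IngestChunkProgressRepository(conn).record_chunk(
                source_id, last_index=i, embedded_chunks=i + 1
            )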
@@ -7,6 +7,11 @@ Mirrors the continuation service's three operations on
 - load_state → find_one by (conversation_id, user_id)
 - delete_state → delete_one by (conversation_id, user_id)
 
+Adds ``mark_resuming`` so a resumed run can claim a row without
+deleting it; a separate ``revert_stale_resuming`` flips abandoned
+``resuming`` rows back to ``pending`` so a crashed worker doesn't
+strand the user.
+
 Plus a cleanup method for the Celery beat task that replaces Mongo's
 TTL index.
 """
@@ -20,6 +25,7 @@ from typing import Optional
 from sqlalchemy import Connection, text
 
 from application.storage.db.base_repository import row_to_dict
+from application.storage.db.serialization import PGNativeJSONEncoder
 
 PENDING_STATE_TTL_SECONDS = 30 * 60  # 1800 seconds
 
@@ -71,19 +77,24 @@ class PendingToolStateRepository:
                     agent_config = EXCLUDED.agent_config,
                     client_tools = EXCLUDED.client_tools,
                     created_at = EXCLUDED.created_at,
-                    expires_at = EXCLUDED.expires_at
+                    expires_at = EXCLUDED.expires_at,
+                    status = 'pending',
+                    resumed_at = NULL
                 RETURNING *
                 """
            ),
            {
                "conv_id": conversation_id,
                "user_id": user_id,
-                "messages": json.dumps(messages),
-                "pending": json.dumps(pending_tool_calls),
-                "tools_dict": json.dumps(tools_dict),
-                "schemas": json.dumps(tool_schemas),
-                "agent_config": json.dumps(agent_config),
-                "client_tools": json.dumps(client_tools) if client_tools is not None else None,
+                "messages": json.dumps(messages, cls=PGNativeJSONEncoder),
+                "pending": json.dumps(pending_tool_calls, cls=PGNativeJSONEncoder),
+                "tools_dict": json.dumps(tools_dict, cls=PGNativeJSONEncoder),
+                "schemas": json.dumps(tool_schemas, cls=PGNativeJSONEncoder),
+                "agent_config": json.dumps(agent_config, cls=PGNativeJSONEncoder),
+                "client_tools": (
+                    json.dumps(client_tools, cls=PGNativeJSONEncoder)
+                    if client_tools is not None else None
+                ),
                "created_at": now,
                "expires_at": expires,
            },
@@ -113,6 +124,45 @@ class PendingToolStateRepository:
         )
         return result.rowcount > 0
 
+    def mark_resuming(self, conversation_id: str, user_id: str) -> bool:
+        """Flip a pending row to ``resuming`` and stamp ``resumed_at``."""
+        result = self._conn.execute(
+            text(
+                """
+                UPDATE pending_tool_state
+                SET status = 'resuming', resumed_at = clock_timestamp()
+                WHERE conversation_id = CAST(:conv_id AS uuid)
+                  AND user_id = :user_id
+                  AND status = 'pending'
+                """
+            ),
+            {"conv_id": conversation_id, "user_id": user_id},
+        )
+        return result.rowcount > 0
+
+    def revert_stale_resuming(
+        self,
+        grace_seconds: int = 600,
+        ttl_extension_seconds: int = PENDING_STATE_TTL_SECONDS,
+    ) -> int:
+        """Revert ``resuming`` rows older than ``grace_seconds`` to ``pending``; bump TTL."""
+        result = self._conn.execute(
+            text(
+                """
+                UPDATE pending_tool_state
+                SET status = 'pending',
+                    resumed_at = NULL,
+                    expires_at = clock_timestamp()
+                        + make_interval(secs => :ttl)
+                WHERE status = 'resuming'
+                  AND resumed_at
+                      < clock_timestamp() - make_interval(secs => :grace)
+                """
+            ),
+            {"grace": grace_seconds, "ttl": ttl_extension_seconds},
+        )
+        return result.rowcount
+
     def cleanup_expired(self) -> int:
         """Delete rows where ``expires_at < now()``.
 
application/storage/db/repositories/reconciliation.py (new file, 273 lines)
@@ -0,0 +1,273 @@
"""Repository for reconciliation sweeps over stuck durability rows."""

from __future__ import annotations

from sqlalchemy import Connection, text

from application.storage.db.base_repository import row_to_dict


class ReconciliationRepository:
    """Sweeps and terminal writes for the reconciler beat task."""

    def __init__(self, conn: Connection) -> None:
        self._conn = conn

    def find_and_lock_stuck_messages(
        self, *, age_minutes: int = 5, limit: int = 100,
    ) -> list[dict]:
        """Lock stuck pending/streaming messages, skipping live resumes.

        Staleness rides on the **later of** ``cm.timestamp`` (creation)
        and ``message_metadata.last_heartbeat_at`` (route heartbeat). An
        in-flight stream that re-stamps the heartbeat each minute stays
        out of the sweep; reconciler-side writes deliberately don't
        touch either column so the per-row attempts counter advances
        across ticks. Liveness exemption covers both ``pending`` (paused
        waiting for resume) and ``resuming`` (actively executing)
        ``pending_tool_state`` rows so a paused message survives until
        the PT row's own TTL retires it.
        """
        result = self._conn.execute(
            text(
                """
                SELECT cm.id, cm.conversation_id, cm.user_id, cm.timestamp,
                       cm.message_metadata
                FROM conversation_messages cm
                WHERE cm.status IN ('pending', 'streaming')
                  AND cm.timestamp < now() - make_interval(mins => :age)
                  AND COALESCE(
                          (cm.message_metadata->>'last_heartbeat_at')::timestamptz,
                          cm.timestamp
                      ) < now() - make_interval(mins => :age)
                  AND NOT EXISTS (
                      SELECT 1
                      FROM pending_tool_state pts
                      WHERE pts.conversation_id = cm.conversation_id
                        AND (
                            (pts.status = 'pending'
                             AND pts.expires_at > now())
                            OR
                            (pts.status = 'resuming'
                             AND pts.resumed_at
                                 > now() - interval '10 minutes')
                        )
                  )
                ORDER BY cm.timestamp ASC
                LIMIT :limit
                FOR UPDATE OF cm SKIP LOCKED
                """
            ),
            {"age": age_minutes, "limit": limit},
        )
        return [row_to_dict(r) for r in result.fetchall()]

    def find_and_lock_proposed_tool_calls(
        self, *, age_minutes: int = 5, limit: int = 100,
    ) -> list[dict]:
        """Lock tool_call_attempts that never advanced past ``proposed``."""
        result = self._conn.execute(
            text(
                """
                SELECT call_id, message_id, tool_id, tool_name, action_name,
                       arguments, attempted_at, updated_at
                FROM tool_call_attempts
                WHERE status = 'proposed'
                  AND attempted_at < now() - make_interval(mins => :age)
                ORDER BY attempted_at ASC
                LIMIT :limit
                FOR UPDATE SKIP LOCKED
                """
            ),
            {"age": age_minutes, "limit": limit},
        )
        return [row_to_dict(r) for r in result.fetchall()]

    def find_and_lock_executed_tool_calls(
        self, *, age_minutes: int = 15, limit: int = 100,
    ) -> list[dict]:
        """Lock tool_call_attempts stuck in ``executed`` past the confirm window."""
        result = self._conn.execute(
            text(
                """
                SELECT call_id, message_id, tool_id, tool_name, action_name,
                       arguments, result, attempted_at, updated_at
                FROM tool_call_attempts
                WHERE status = 'executed'
                  AND updated_at < now() - make_interval(mins => :age)
                ORDER BY updated_at ASC
                LIMIT :limit
                FOR UPDATE SKIP LOCKED
                """
            ),
            {"age": age_minutes, "limit": limit},
        )
        return [row_to_dict(r) for r in result.fetchall()]

    def find_and_lock_stalled_ingests(
        self, *, age_minutes: int = 30, limit: int = 100,
    ) -> list[dict]:
        """Lock ingest checkpoints whose heartbeat hasn't ticked recently."""
        result = self._conn.execute(
            text(
                """
                SELECT source_id, total_chunks, embedded_chunks,
                       last_index, last_updated
                FROM ingest_chunk_progress
                WHERE last_updated < now() - make_interval(mins => :age)
                  AND embedded_chunks < total_chunks
                ORDER BY last_updated ASC
                LIMIT :limit
                FOR UPDATE SKIP LOCKED
                """
            ),
            {"age": age_minutes, "limit": limit},
        )
        return [row_to_dict(r) for r in result.fetchall()]

    def touch_ingest_progress(self, source_id: str) -> bool:
        """Bump ``last_updated`` so a once-stalled ingest re-enters the watch window."""
        result = self._conn.execute(
            text(
                "UPDATE ingest_chunk_progress SET last_updated = now() "
                "WHERE source_id = CAST(:sid AS uuid)"
            ),
            {"sid": str(source_id)},
        )
        return result.rowcount > 0

    def increment_message_reconcile_attempts(self, message_id: str) -> int:
        """Bump ``message_metadata.reconcile_attempts`` and return the new count."""
        result = self._conn.execute(
            text(
                """
                UPDATE conversation_messages
                SET message_metadata = jsonb_set(
                    COALESCE(message_metadata, '{}'::jsonb),
                    '{reconcile_attempts}',
                    to_jsonb(
                        COALESCE(
                            (message_metadata->>'reconcile_attempts')::int,
                            0
                        ) + 1
                    )
                )
                WHERE id = CAST(:message_id AS uuid)
                RETURNING (message_metadata->>'reconcile_attempts')::int
                    AS new_count
                """
            ),
            {"message_id": message_id},
        )
        row = result.fetchone()
        return int(row[0]) if row is not None else 0

    def mark_message_failed(self, message_id: str, *, error: str) -> bool:
        """Flip a message to ``status='failed'`` and stash ``error`` in metadata."""
        result = self._conn.execute(
            text(
                """
                UPDATE conversation_messages
                SET status = 'failed',
                    message_metadata = jsonb_set(
                        COALESCE(message_metadata, '{}'::jsonb),
                        '{error}',
                        to_jsonb(CAST(:error AS text))
                    )
                WHERE id = CAST(:message_id AS uuid)
                """
            ),
            {"message_id": message_id, "error": error},
        )
        return result.rowcount > 0

    def mark_tool_call_failed(self, call_id: str, *, error: str) -> bool:
        """Flip a tool_call_attempts row to ``failed`` with ``error``."""
        result = self._conn.execute(
            text(
                "UPDATE tool_call_attempts SET status = 'failed', "
                "error = :error WHERE call_id = :call_id"
            ),
            {"call_id": call_id, "error": error},
        )
        return result.rowcount > 0

    def find_stuck_idempotency_pending(
        self,
        *,
        max_attempts: int,
        lease_grace_seconds: int = 60,
        limit: int = 100,
    ) -> list[dict]:
        """Lock ``task_dedup`` rows abandoned past the lease + retry budget.

        A row is "stuck" when:

        - ``status='pending'`` (lease was claimed but never finalised)
        - ``lease_expires_at`` is past by at least ``lease_grace_seconds``
          (the heartbeat thread is gone — the lease isn't going to come
          back)
        - ``attempt_count >= max_attempts`` (the poison-loop guard
          should already have escalated this; if it hasn't, the wrapper
          died before getting there)

        These rows would otherwise sit in ``pending`` until the 24 h
        TTL aged them out, blocking same-key retries via
        ``_lookup_completed`` returning None for the whole window.
        """
        result = self._conn.execute(
            text(
                """
                SELECT idempotency_key, task_name, task_id, attempt_count,
                       lease_owner_id, lease_expires_at, created_at
                FROM task_dedup
                WHERE status = 'pending'
                  AND lease_expires_at IS NOT NULL
                  AND lease_expires_at
                      < now() - make_interval(secs => :grace)
                  AND attempt_count >= :max_attempts
                ORDER BY created_at ASC
                LIMIT :limit
                FOR UPDATE SKIP LOCKED
                """
            ),
            {
                "max_attempts": int(max_attempts),
                "grace": int(lease_grace_seconds),
                "limit": int(limit),
            },
        )
        return [row_to_dict(r) for r in result.fetchall()]

    def mark_idempotency_pending_failed(
        self, key: str, *, error: str,
    ) -> bool:
        """Promote a stuck pending ``task_dedup`` row to ``failed``."""
        import json

        from application.storage.db.serialization import PGNativeJSONEncoder

        result = self._conn.execute(
            text(
                """
                UPDATE task_dedup
                SET status = 'failed',
                    result_json = CAST(:result AS jsonb),
                    lease_owner_id = NULL,
                    lease_expires_at = NULL
                WHERE idempotency_key = :key
                  AND status = 'pending'
                """
            ),
            {
                "key": key,
                "result": json.dumps(
                    {
                        "success": False,
                        "error": error,
                        "reconciled": True,
                    },
                    cls=PGNativeJSONEncoder,
                ),
            },
        )
        return result.rowcount > 0
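A rough sketch of how a reconciler beat tick might drive these sweeps; the attempt threshold and error strings are illustrative, not the shipped values:

from application.storage.db.repositories.reconciliation import (
    ReconciliationRepository,
)
from application.storage.db.session import db_session

MAX_RECONCILE_ATTEMPTS = 3  # assumed threshold for this sketch

def reconcile_tick():
    # One transaction, so the FOR UPDATE ... SKIP LOCKED locks hold
    # for the duration of the terminal writes.
    with db_session() as conn:
        repo = ReconciliationRepository(conn)
        for msg in repo.find_and_lock_stuck_messages(age_minutes=5):
            attempts = repo.increment_message_reconcile_attempts(str(msg["id"]))
            if attempts >= MAX_RECONCILE_ATTEMPTS:
                repo.mark_message_failed(
                    str(msg["id"]), error="stream abandoned; reconciled"
                )
        for call in repo.find_and_lock_proposed_tool_calls(age_minutes=5):
            repo.mark_tool_call_failed(
                call["call_id"], error="tool call never executed"
            )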
@@ -13,6 +13,8 @@ import json
 from datetime import datetime
 from typing import Optional
 
+from application.storage.db.serialization import PGNativeJSONEncoder
+
 from sqlalchemy import Connection, text
 
 
@@ -52,7 +54,7 @@ class StackLogsRepository:
                 "user_id": user_id,
                 "api_key": api_key,
                 "query": query,
-                "stacks": json.dumps(stacks or []),
+                "stacks": json.dumps(stacks or [], cls=PGNativeJSONEncoder),
                 "timestamp": timestamp,
             },
         )
@@ -31,6 +31,8 @@ class TokenUsageRepository:
         agent_id: Optional[str] = None,
         prompt_tokens: int = 0,
         generated_tokens: int = 0,
+        source: str = "agent_stream",
+        request_id: Optional[str] = None,
         timestamp: Optional[datetime] = None,
     ) -> None:
         # Attribution guard: the ``token_usage_attribution_chk`` CHECK
@@ -54,12 +56,16 @@ class TokenUsageRepository:
         self._conn.execute(
             text(
                 """
-                INSERT INTO token_usage (user_id, api_key, agent_id, prompt_tokens, generated_tokens, timestamp)
+                INSERT INTO token_usage (
+                    user_id, api_key, agent_id,
+                    prompt_tokens, generated_tokens,
+                    source, request_id, timestamp
+                )
                 VALUES (
                     :user_id, :api_key,
                     CAST(:agent_id AS uuid),
                     :prompt_tokens, :generated_tokens,
-                    COALESCE(:timestamp, now())
+                    :source, :request_id, COALESCE(:timestamp, now())
                 )
                 """
             ),
@@ -69,6 +75,8 @@ class TokenUsageRepository:
                 "agent_id": agent_id_uuid,
                 "prompt_tokens": prompt_tokens,
                 "generated_tokens": generated_tokens,
+                "source": source,
+                "request_id": request_id,
                 "timestamp": timestamp,
             },
         )
@@ -173,8 +181,22 @@ class TokenUsageRepository:
         user_id: Optional[str] = None,
         api_key: Optional[str] = None,
     ) -> int:
-        """Count of token_usage rows in the given time range (for request limiting)."""
-        clauses = ["timestamp >= :start", "timestamp <= :end"]
+        """Count user-initiated requests in the given time range.
+
+        A request = one ``agent_stream`` invocation. Multi-tool agent
+        runs produce multiple rows (one per LLM call) tagged with the
+        same ``request_id``; we DISTINCT on that to count the request
+        once. Pre-migration rows have ``request_id=NULL`` and are
+        counted one-per-row via the second branch (back-compat).
+        Side-channel sources (``title`` / ``compression`` /
+        ``rag_condense`` / ``fallback``) are excluded — they aren't
+        user-initiated and shouldn't tick the request limit.
+        """
+        clauses = [
+            "timestamp >= :start",
+            "timestamp <= :end",
+            "source = 'agent_stream'",
+        ]
         params: dict = {"start": start, "end": end}
         if user_id is not None:
             clauses.append("user_id = :user_id")
@@ -184,7 +206,15 @@ class TokenUsageRepository:
             params["api_key"] = api_key
         where = " AND ".join(clauses)
         result = self._conn.execute(
-            text(f"SELECT COUNT(*) FROM token_usage WHERE {where}"),
+            text(
+                f"""
+                SELECT
+                    COUNT(DISTINCT request_id) FILTER (WHERE request_id IS NOT NULL)
+                    + COUNT(*) FILTER (WHERE request_id IS NULL)
+                FROM token_usage
+                WHERE {where}
+                """
+            ),
             params,
         )
         return result.scalar()
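To make the two COUNT branches concrete, a hypothetical window of rows shown as (source, request_id):

# ('agent_stream', 'req-1')   one multi-tool run: three rows,
# ('agent_stream', 'req-1')   all sharing request_id, counted once
# ('agent_stream', 'req-1')   by COUNT(DISTINCT request_id)
# ('agent_stream', NULL)      pre-migration row: counted per-row
# ('title', 'req-1')          side channel: dropped by source = 'agent_stream'
#
# count_in_range(...) == 1 (distinct req-1) + 1 (NULL row) == 2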
application/storage/db/repositories/tool_call_attempts.py (new file, 144 lines)
@@ -0,0 +1,144 @@
"""Repository for ``tool_call_attempts``; executor's proposed/executed/failed writes."""

from __future__ import annotations

import json
from typing import Any, Optional

from sqlalchemy import Connection, text

from application.storage.db.serialization import PGNativeJSONEncoder


class ToolCallAttemptsRepository:
    def __init__(self, conn: Connection) -> None:
        self._conn = conn

    def record_proposed(
        self,
        call_id: str,
        tool_name: str,
        action_name: str,
        arguments: Any,
        *,
        tool_id: Optional[str] = None,
    ) -> bool:
        """Insert a ``proposed`` row before the tool executes.

        Returns True if a new row was created. ``ON CONFLICT DO NOTHING``
        guards against the LLM emitting a duplicate ``call_id``: the
        existing row stays put rather than a re-insert raising
        ``IntegrityError``.
        """
        result = self._conn.execute(
            text(
                """
                INSERT INTO tool_call_attempts
                    (call_id, tool_id, tool_name, action_name, arguments, status)
                VALUES
                    (:call_id, CAST(:tool_id AS uuid), :tool_name,
                     :action_name, CAST(:arguments AS jsonb), 'proposed')
                ON CONFLICT (call_id) DO NOTHING
                """
            ),
            {
                "call_id": call_id,
                "tool_id": tool_id,
                "tool_name": tool_name,
                "action_name": action_name,
                "arguments": json.dumps(
                    arguments if arguments is not None else {},
                    cls=PGNativeJSONEncoder,
                ),
            },
        )
        return result.rowcount > 0

    def upsert_executed(
        self,
        call_id: str,
        tool_name: str,
        action_name: str,
        arguments: Any,
        result: Any,
        *,
        tool_id: Optional[str] = None,
        message_id: Optional[str] = None,
        artifact_id: Optional[str] = None,
    ) -> None:
        """Insert OR upgrade a row to ``executed``.

        Used as a fallback when ``record_proposed`` failed (DB outage)
        and the tool ran anyway — preserves the journal so the
        reconciler can still see the attempt.
        """
        result_payload: dict = {"result": result}
        if artifact_id:
            result_payload["artifact_id"] = artifact_id
        self._conn.execute(
            text(
                """
                INSERT INTO tool_call_attempts
                    (call_id, tool_id, tool_name, action_name, arguments,
                     result, message_id, status)
                VALUES
                    (:call_id, CAST(:tool_id AS uuid), :tool_name,
                     :action_name, CAST(:arguments AS jsonb),
                     CAST(:result AS jsonb), CAST(:message_id AS uuid),
                     'executed')
                ON CONFLICT (call_id) DO UPDATE
                SET status = 'executed',
                    result = EXCLUDED.result,
                    message_id = COALESCE(EXCLUDED.message_id, tool_call_attempts.message_id)
                """
            ),
            {
                "call_id": call_id,
                "tool_id": tool_id,
                "tool_name": tool_name,
                "action_name": action_name,
                "arguments": json.dumps(
                    arguments if arguments is not None else {},
                    cls=PGNativeJSONEncoder,
                ),
                "result": json.dumps(result_payload, cls=PGNativeJSONEncoder),
                "message_id": message_id,
            },
        )

    def mark_executed(
        self,
        call_id: str,
        result: Any,
        *,
        message_id: Optional[str] = None,
        artifact_id: Optional[str] = None,
    ) -> bool:
        """Flip ``proposed`` → ``executed`` with the tool result.

        ``artifact_id`` (when present) is stored alongside ``result`` in
        the JSONB as audit data — the reconciler reads it for diagnostic
        alerts when escalating stuck rows to ``failed``.
        """
        result_payload: dict = {"result": result}
        if artifact_id:
            result_payload["artifact_id"] = artifact_id
        sql = (
            "UPDATE tool_call_attempts SET "
            "status = 'executed', result = CAST(:result AS jsonb)"
        )
        params: dict[str, Any] = {
            "call_id": call_id,
            "result": json.dumps(result_payload, cls=PGNativeJSONEncoder),
        }
        if message_id is not None:
            sql += ", message_id = CAST(:message_id AS uuid)"
            params["message_id"] = message_id
        sql += " WHERE call_id = :call_id"
        result_proxy = self._conn.execute(text(sql), params)
        return result_proxy.rowcount > 0

    def mark_failed(self, call_id: str, error: str) -> bool:
        """Flip ``proposed`` → ``failed`` with the exception text."""
        result = self._conn.execute(
            text(
                "UPDATE tool_call_attempts SET status = 'failed', error = :error "
                "WHERE call_id = :call_id"
            ),
            {"call_id": call_id, "error": error},
        )
        return result.rowcount > 0
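The duplicate-call_id contract in miniature; a sketch assuming an open session, with tool names and arguments purely illustrative:

from application.storage.db.repositories.tool_call_attempts import (
    ToolCallAttemptsRepository,
)
from application.storage.db.session import db_session

with db_session() as conn:
    repo = ToolCallAttemptsRepository(conn)
    first = repo.record_proposed("call-1", "api_tool", "get", {"q": 1})
    dup = repo.record_proposed("call-1", "api_tool", "get", {"q": 1})
# first is True (row created); dup is False: ON CONFLICT DO NOTHING
# leaves the original 'proposed' row untouched.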
@@ -20,6 +20,7 @@ from typing import Optional
 from sqlalchemy import Connection, text
 
 from application.storage.db.base_repository import row_to_dict
+from application.storage.db.serialization import PGNativeJSONEncoder
 
 
 class UserLogsRepository:
@@ -46,7 +47,7 @@ class UserLogsRepository:
             {
                 "user_id": user_id,
                 "endpoint": endpoint,
-                "data": json.dumps(data, default=str) if data is not None else None,
+                "data": json.dumps(data, cls=PGNativeJSONEncoder) if data is not None else None,
                 "timestamp": timestamp,
             },
         )
application/storage/db/serialization.py (new file, 93 lines)
@@ -0,0 +1,93 @@
"""JSON-safe coercion for PG-native Python types.

Postgres (via psycopg) returns native Python types — ``uuid.UUID``,
``datetime.datetime``/``datetime.date``, ``decimal.Decimal``, ``bytes``
— that ``json.dumps`` rejects. This module is the single place those
coercion rules live; everywhere else should call into it.

Two interfaces with identical coverage:

* :func:`coerce_pg_native` — recursive walk returning a JSON-safe copy.
  Use when you need to inspect the dict yourself or pass it to a
  serializer that doesn't accept a custom encoder (e.g. SQLAlchemy
  parameter binding for a JSONB column).
* :class:`PGNativeJSONEncoder` — ``JSONEncoder`` subclass. Use as
  ``json.dumps(obj, cls=PGNativeJSONEncoder)`` for serialise-once flows
  where the extra recursive walk is wasted work.

Coercion rules:

* ``UUID`` → canonical hex string.
* ``datetime`` / ``date`` → ISO 8601 string.
* ``Decimal`` → numeric string (preserves precision; ``float()`` would not).
* ``bytes`` → base64 string. Lossless and universally JSON-safe;
  prior code used UTF-8 with ``errors="replace"`` which silently
  corrupted binary payloads (e.g. Gemini's ``thought_signature``).
"""

from __future__ import annotations

import base64
import binascii
import json
from datetime import date, datetime
from decimal import Decimal
from typing import Any
from uuid import UUID


def _coerce_scalar(obj: Any) -> Any:
    if isinstance(obj, UUID):
        return str(obj)
    if isinstance(obj, (datetime, date)):
        return obj.isoformat()
    if isinstance(obj, Decimal):
        return str(obj)
    if isinstance(obj, bytes):
        return base64.b64encode(obj).decode("ascii")
    return obj


def coerce_pg_native(obj: Any) -> Any:
    """Recursively coerce PG-native types to JSON-safe equivalents.

    Recurses into ``dict`` (stringifying keys, matching prior helper
    behavior) and ``list``/``tuple`` (tuples flatten to lists since JSON
    has no tuple type). Any other type passes through unchanged.
    """
    if isinstance(obj, dict):
        return {str(k): coerce_pg_native(v) for k, v in obj.items()}
    if isinstance(obj, (list, tuple)):
        return [coerce_pg_native(v) for v in obj]
    return _coerce_scalar(obj)


def decode_base64_bytes(value: Any) -> Any:
    """Reverse ``coerce_pg_native``'s bytes-to-base64 step.

    Useful at egress points that need the original bytes back (e.g.
    sending Gemini's ``thought_signature`` to the SDK on resume). Uses
    ``validate=True`` so plain ASCII strings that happen to be
    permissively decodable (e.g. ``"abcd"``) are not silently turned
    into bytes — the original value passes through.
    """
    if isinstance(value, str):
        try:
            return base64.b64decode(value.encode("ascii"), validate=True)
        except (binascii.Error, ValueError):
            return value
    return value


class PGNativeJSONEncoder(json.JSONEncoder):
    """``JSONEncoder`` covering UUID / datetime / date / Decimal / bytes.

    Use as ``json.dumps(obj, cls=PGNativeJSONEncoder)``. Equivalent in
    coverage to :func:`coerce_pg_native` but skips the eager walk.
    """

    def default(self, obj: Any) -> Any:
        coerced = _coerce_scalar(obj)
        if coerced is obj:
            return super().default(obj)
        return coerced
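A quick round-trip under the stated coercion rules (stdlib only; the row dict is illustrative):

import json
from datetime import datetime, timezone
from decimal import Decimal
from uuid import uuid4

from application.storage.db.serialization import (
    PGNativeJSONEncoder,
    coerce_pg_native,
    decode_base64_bytes,
)

row = {
    "id": uuid4(),
    "at": datetime.now(timezone.utc),
    "amount": Decimal("19.90"),
    "sig": b"\x00\x01binary",  # e.g. a binary signature payload
}
# Both interfaces produce the same JSON text.
assert json.dumps(coerce_pg_native(row)) == json.dumps(row, cls=PGNativeJSONEncoder)
# bytes survive the trip through base64 at the egress point.
assert decode_base64_bytes(coerce_pg_native(row)["sig"]) == row["sig"]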
@@ -1,7 +1,5 @@
-import sys
 import logging
 import time
-from datetime import datetime
 
 from application.storage.db.repositories.token_usage import TokenUsageRepository
 from application.storage.db.session import db_session
@@ -93,33 +91,62 @@ def _count_prompt_tokens(messages, tools=None, usage_attachments=None, **kwargs)
     return prompt_tokens
 
 
-def update_token_usage(decoded_token, user_api_key, token_usage, agent_id=None):
-    if "pytest" in sys.modules:
-        return
-    user_id = decoded_token.get("sub") if isinstance(decoded_token, dict) else None
-    normalized_agent_id = str(agent_id) if agent_id else None
-
-    if not user_id and not user_api_key and not normalized_agent_id:
+def _persist_call_usage(llm, call_usage):
+    """Write one ``token_usage`` row per LLM call. Always-on; no flag.
+
+    Source defaults to ``agent_stream`` and can be overridden per
+    instance via ``_token_usage_source`` (set on side-channel LLMs:
+    title / compression / rag_condense / fallback). A ``_request_id``
+    stamped on the LLM lets ``count_in_range`` deduplicate the multiple
+    rows produced by a single multi-tool agent run.
+    """
+    if call_usage["prompt_tokens"] == 0 and call_usage["generated_tokens"] == 0:
+        return
+    decoded_token = getattr(llm, "decoded_token", None)
+    user_id = (
+        decoded_token.get("sub") if isinstance(decoded_token, dict) else None
+    )
+    user_api_key = getattr(llm, "user_api_key", None)
+    agent_id = getattr(llm, "agent_id", None)
+    if not user_id and not user_api_key:
+        # Repository would raise on the attribution check — log instead
+        # so operators see the gap rather than crashing the stream.
         logger.warning(
-            "Skipping token usage insert: missing user_id, api_key, and agent_id"
+            "token_usage skip: no user_id/api_key on LLM instance",
+            extra={
+                "source": getattr(llm, "_token_usage_source", "agent_stream"),
+            },
         )
         return
 
     try:
         with db_session() as conn:
+            # ``timestamp`` is omitted so Postgres ``server_default
+            # = func.now()`` populates a tz-aware UTC value; passing
+            # naive ``datetime.now()`` would silently shift on
+            # non-UTC servers.
             TokenUsageRepository(conn).insert(
                 user_id=user_id,
                 api_key=user_api_key,
-                agent_id=normalized_agent_id,
-                prompt_tokens=token_usage["prompt_tokens"],
-                generated_tokens=token_usage["generated_tokens"],
-                timestamp=datetime.now(),
+                agent_id=str(agent_id) if agent_id else None,
+                prompt_tokens=call_usage["prompt_tokens"],
+                generated_tokens=call_usage["generated_tokens"],
+                source=(
+                    getattr(llm, "_token_usage_source", None) or "agent_stream"
+                ),
+                request_id=getattr(llm, "_request_id", None),
             )
-    except Exception as e:
-        logger.error(f"Failed to record token usage: {e}", exc_info=True)
+    except Exception:
+        logger.exception("token_usage persist failed")
 
 
 def gen_token_usage(func):
+    """Accumulate per-call token counts and write a ``token_usage`` row.
+
+    The accumulator on ``self.token_usage`` stays in place for code
+    paths that introspect it (e.g., logging, response payloads). DB
+    persistence happens here for every call so primary streams,
+    side-channel LLMs, and no-save flows all produce rows uniformly.
+    """
     def wrapper(self, model, messages, stream, tools, **kwargs):
         usage_attachments = kwargs.pop("_usage_attachments", None)
         call_usage = {"prompt_tokens": 0, "generated_tokens": 0}
@@ -133,18 +160,14 @@ def gen_token_usage(func):
             call_usage["generated_tokens"] += _count_tokens(result)
         self.token_usage["prompt_tokens"] += call_usage["prompt_tokens"]
         self.token_usage["generated_tokens"] += call_usage["generated_tokens"]
-        update_token_usage(
-            self.decoded_token,
-            self.user_api_key,
-            call_usage,
-            getattr(self, "agent_id", None),
-        )
+        _persist_call_usage(self, call_usage)
         return result
 
     return wrapper
 
 
 def stream_token_usage(func):
+    """Stream variant of ``gen_token_usage``. Same persistence contract."""
     def wrapper(self, model, messages, stream, tools, **kwargs):
         usage_attachments = kwargs.pop("_usage_attachments", None)
         call_usage = {"prompt_tokens": 0, "generated_tokens": 0}
@@ -173,15 +196,7 @@ def stream_token_usage(func):
             call_usage["generated_tokens"] += _count_tokens(line)
         self.token_usage["prompt_tokens"] += call_usage["prompt_tokens"]
         self.token_usage["generated_tokens"] += call_usage["generated_tokens"]
         # Persist usage rows only on success: a partial mid-stream
         # failure shouldn't bill the user for a response they never got.
         if error is None:
-            update_token_usage(
-                self.decoded_token,
-                self.user_api_key,
-                call_usage,
-                getattr(self, "agent_id", None),
-            )
+            _persist_call_usage(self, call_usage)
         emit = getattr(self, "_emit_stream_finished_log", None)
         if callable(emit):
             try:
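How a caller would tag LLM instances under this contract; the attribute names come from the diff, while build_llm and the ids are illustrative:

llm = build_llm()                      # hypothetical factory
llm._token_usage_source = "title"      # side channel: excluded from count_in_range
llm._request_id = None                 # side channels carry no request id

agent_llm = build_llm()
agent_llm._token_usage_source = "agent_stream"
agent_llm._request_id = "req-123"      # shared by every LLM call in one agent run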
@@ -1,5 +1,8 @@
 import logging
 from typing import List, Optional, Any, Dict
+
+from psycopg.types.json import Jsonb
+
 from application.core.settings import settings
 from application.vectorstore.base import BaseVectorStore
 from application.vectorstore.document_class import Document
@@ -175,7 +178,7 @@ class PGVectorStore(BaseVectorStore):
             for text, embedding, metadata in zip(texts, embeddings, metadatas):
                 cursor.execute(
                     insert_query,
-                    (text, embedding, metadata, self._source_id)
+                    (text, embedding, Jsonb(metadata), self._source_id)
                 )
                 inserted_id = cursor.fetchone()[0]
                 inserted_ids.append(str(inserted_id))
@@ -266,7 +269,7 @@ class PGVectorStore(BaseVectorStore):
 
             cursor.execute(
                 insert_query,
-                (text, embeddings[0], final_metadata, self._source_id)
+                (text, embeddings[0], Jsonb(final_metadata), self._source_id)
             )
             inserted_id = cursor.fetchone()[0]
             conn.commit()
@@ -6,6 +6,7 @@ import os
 import shutil
 import string
 import tempfile
+import threading
 from typing import Any, Dict
 import zipfile
 
@@ -22,7 +23,10 @@ from application.cache import get_redis_instance
 from application.core.settings import settings
 from application.parser.chunking import Chunker
 from application.parser.connectors.connector_creator import ConnectorCreator
-from application.parser.embedding_pipeline import embed_and_store_documents
+from application.parser.embedding_pipeline import (
+    assert_index_complete,
+    embed_and_store_documents,
+)
 from application.parser.file.bulk import SimpleDirectoryReader, get_default_file_extractor
 from application.parser.file.constants import SUPPORTED_SOURCE_EXTENSIONS
 from application.parser.remote.remote_creator import RemoteCreator
@@ -32,6 +36,9 @@ from application.retriever.retriever_creator import RetrieverCreator
 from application.storage.db.base_repository import looks_like_uuid
 from application.storage.db.repositories.agents import AgentsRepository
 from application.storage.db.repositories.attachments import AttachmentsRepository
+from application.storage.db.repositories.ingest_chunk_progress import (
+    IngestChunkProgressRepository,
+)
 from application.storage.db.repositories.sources import SourcesRepository
 from application.storage.db.session import db_readonly, db_session
 from application.storage.storage_creator import StorageCreator
@@ -43,6 +50,53 @@ from application.utils import count_tokens_docs, num_tokens_from_string, safe_fi
 MIN_TOKENS = 150
 MAX_TOKENS = 1250
 RECURSION_DEPTH = 2
+INGEST_HEARTBEAT_INTERVAL_SECONDS = 30
+
+# Stable namespace for deterministic source IDs derived from idempotency keys.
+# Pinned literal — do not change. Re-rolling this would mint different
+# source_ids for the same idempotency_keys across deploys, defeating the
+# retry-resume contract.
+DOCSGPT_INGEST_NAMESPACE = uuid.UUID("fa25d5d1-398b-46df-ac89-8d1c360b9bea")
+
+
+def _derive_source_id(idempotency_key):
+    """``uuid5(NS, key)`` when a key is supplied; ``uuid4()`` otherwise."""
+    if isinstance(idempotency_key, str) and idempotency_key:
+        return uuid.uuid5(DOCSGPT_INGEST_NAMESPACE, idempotency_key)
+    return uuid.uuid4()
+
+
+def _ingest_heartbeat_loop(source_id, stop_event, interval=INGEST_HEARTBEAT_INTERVAL_SECONDS):
+    """Bump ``ingest_chunk_progress.last_updated`` until ``stop_event`` is set."""
+    while not stop_event.wait(interval):
+        try:
+            with db_session() as conn:
+                IngestChunkProgressRepository(conn).bump_heartbeat(source_id)
+        except Exception as e:
+            logging.warning(
+                f"Heartbeat failed for {source_id}: {e}", exc_info=True
+            )
+
+
+def _start_ingest_heartbeat(source_id):
+    """Spawn the heartbeat daemon and return ``(thread, stop_event)``."""
+    stop_event = threading.Event()
+    thread = threading.Thread(
+        target=_ingest_heartbeat_loop,
+        args=(str(source_id), stop_event),
+        daemon=True,
+        name=f"ingest-heartbeat-{source_id}",
+    )
+    thread.start()
+    return thread, stop_event
+
+
+def _stop_ingest_heartbeat(thread, stop_event):
+    """Signal the heartbeat daemon to exit and wait briefly for it."""
+    if stop_event is not None:
+        stop_event.set()
+    if thread is not None:
+        thread.join(timeout=5)
+
+
 # Define a function to extract metadata from a given filename.
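The retry-resume property rests on uuid5 being a pure function of (namespace, key), so the same idempotency key always maps to the same source row; a stdlib-only check with an illustrative key:

import uuid

NS = uuid.UUID("fa25d5d1-398b-46df-ac89-8d1c360b9bea")
key = "user-42:upload:report.pdf"  # hypothetical idempotency key
assert uuid.uuid5(NS, key) == uuid.uuid5(NS, key)  # stable across retries and deploys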
@@ -455,6 +509,7 @@ def ingest_worker(
     user,
     retriever="classic",
     file_name_map=None,
+    idempotency_key=None,
 ):
     """
     Ingest and process documents.
@@ -469,6 +524,9 @@ def ingest_worker(
         user (str): Identifier for the user initiating the ingestion (original, unsanitized).
         retriever (str): Type of retriever to use for processing the documents.
         file_name_map (dict|str|None): Optional mapping of safe relative paths to original filenames.
+        idempotency_key (str|None): When provided, the ``source_id`` is derived
+            deterministically from the key so a retried task reuses the same
+            source row instead of duplicating it.
 
     Returns:
         dict: Information about the completed ingestion task, including input parameters and a "limited" flag.
@@ -575,12 +633,23 @@ def ingest_worker(
 
     docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs]
 
-    id = uuid.uuid4()
+    id = _derive_source_id(idempotency_key)
 
     vector_store_path = os.path.join(temp_dir, "vector_store")
     os.makedirs(vector_store_path, exist_ok=True)
 
-    embed_and_store_documents(docs, vector_store_path, id, self)
+    heartbeat_thread, heartbeat_stop = _start_ingest_heartbeat(id)
+    try:
+        embed_and_store_documents(
+            docs, vector_store_path, id, self,
+            attempt_id=getattr(self.request, "id", None),
+        )
+    finally:
+        _stop_ingest_heartbeat(heartbeat_thread, heartbeat_stop)
+    # Defense-in-depth: chunk-progress is the authoritative
+    # record of how many chunks landed; mismatch raises so the
+    # task fails loud rather than caching a partial index.
+    assert_index_complete(id)
 
     tokens = count_tokens_docs(docs)
 
@@ -943,6 +1012,7 @@ def remote_worker(
     sync_frequency="never",
     operation_mode="upload",
     doc_id=None,
+    idempotency_key=None,
 ):
     safe_user = safe_filename(user)
     full_path = os.path.join(directory, safe_user, uuid.uuid4().hex)
@@ -1035,14 +1105,22 @@ def remote_worker(
     )
 
     if operation_mode == "upload":
-        id = uuid.uuid4()
-        embed_and_store_documents(docs, full_path, id, self)
+        id = _derive_source_id(idempotency_key)
+        embed_and_store_documents(
+            docs, full_path, id, self,
+            attempt_id=getattr(self.request, "id", None),
+        )
+        assert_index_complete(id)
     elif operation_mode == "sync":
         if not doc_id:
             logging.error("Invalid doc_id provided for sync operation: %s", doc_id)
             raise ValueError("doc_id must be provided for sync operation.")
         id = str(doc_id)
-        embed_and_store_documents(docs, full_path, id, self)
+        embed_and_store_documents(
+            docs, full_path, id, self,
+            attempt_id=getattr(self.request, "id", None),
+        )
+        assert_index_complete(id)
     self.update_state(state="PROGRESS", meta={"current": 100})
 
     # Serialize remote_data as JSON if it's a dict (for S3, Reddit, etc.)
@@ -1248,16 +1326,10 @@ def attachment_worker(self, file_info, user):
 
 
 def agent_webhook_worker(self, agent_id, payload):
-    """
-    Process the webhook payload for an agent.
-
-    Args:
-        self: Reference to the instance of the task.
-        agent_id (str): Unique identifier for the agent.
-        payload (dict): The payload data from the webhook.
-
-    Returns:
-        dict: Information about the processed webhook.
+    """Process the webhook payload for an agent.
+
+    Raises on failure: Celery treats a returned dict as success and
+    would skip retries, leaving the caller with a stale 200.
     """
     self.update_state(state="PROGRESS", meta={"current": 1})
     try:
@@ -1283,13 +1355,13 @@ def agent_webhook_worker(self, agent_id, payload):
         input_data = json.dumps(payload)
     except Exception as e:
         logging.error(f"Error processing agent webhook: {e}", exc_info=True)
-        return {"status": "error", "error": str(e)}
+        raise
     self.update_state(state="PROGRESS", meta={"current": 50})
     try:
         result = run_agent_logic(agent_config, input_data)
     except Exception as e:
         logging.error(f"Error running agent logic: {e}", exc_info=True)
-        return {"status": "error"}
+        raise
     else:
         logging.info(
             f"Webhook processed for agent {agent_id}", extra={"agent_id": agent_id}
@@ -1312,6 +1384,7 @@ def ingest_connector(
     operation_mode: str = "upload",
     doc_id=None,
     sync_frequency: str = "never",
+    idempotency_key=None,
 ) -> Dict[str, Any]:
     """
     Ingestion for internal knowledge bases (GoogleDrive, etc.).
@@ -1328,6 +1401,8 @@ def ingest_connector(
         operation_mode: "upload" for initial ingestion, "sync" for incremental sync
         doc_id: Document ID for sync operations (required when operation_mode="sync")
         sync_frequency: How often to sync ("never", "daily", "weekly", "monthly")
+        idempotency_key: When provided, the ``source_id`` is derived
+            deterministically so a retried upload reuses the same source row.
     """
     logging.info(
         f"Starting remote ingestion from {source_type} for user: {user}, job: {job_name}"
@@ -1423,7 +1498,7 @@ def ingest_connector(
     docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs]
 
     if operation_mode == "upload":
-        id = uuid.uuid4()
+        id = _derive_source_id(idempotency_key)
     elif operation_mode == "sync":
         if not doc_id:
             logging.error(
@@ -1440,7 +1515,11 @@ def ingest_connector(
     self.update_state(
         state="PROGRESS", meta={"current": 80, "status": "Storing documents"}
     )
-    embed_and_store_documents(docs, vector_store_path, id, self)
+    embed_and_store_documents(
+        docs, vector_store_path, id, self,
+        attempt_id=getattr(self.request, "id", None),
+    )
+    assert_index_complete(id)
 
     tokens = count_tokens_docs(docs)
 
@@ -85,6 +85,13 @@ export default function App() {
           }
         >
           <Route index element={<Conversation />} />
+          {/* One dynamic route (accepting "new" or a UUID) so the
+              /c/new → /c/<id> replace doesn't remount Conversation. */}
+          <Route path="/c/:conversationId" element={<Conversation />} />
+          <Route
+            path="/agents/:agentId/c/:conversationId"
+            element={<Conversation />}
+          />
           <Route path="/settings/*" element={<Setting />} />
           <Route path="/agents/*" element={<Agents />} />
         </Route>
@@ -15,6 +15,7 @@ import Github from './assets/git_nav.svg';
 import Hamburger from './assets/hamburger.svg';
 import openNewChat from './assets/openNewChat.svg';
 import Pin from './assets/pin.svg';
+import SearchIcon from './assets/search.svg';
 import AgentImage from './components/AgentImage';
 import SettingGear from './assets/settingGear.svg';
 import Spark from './assets/spark.svg';
@@ -25,6 +26,7 @@ import UnPin from './assets/unpin.svg';
 import Help from './components/Help';
 import {
   handleAbort,
+  loadConversation,
   selectQueries,
   setConversation,
   updateConversationId,
@@ -34,6 +36,7 @@ import { useDarkTheme, useMediaQuery } from './hooks';
 import useTokenAuth from './hooks/useTokenAuth';
 import DeleteConvModal from './modals/DeleteConvModal';
 import JWTModal from './modals/JWTModal';
+import SearchConversationsModal from './modals/SearchConversationsModal';
 import { ActiveState } from './models/misc';
 import { getConversations } from './preferences/preferenceApi';
 import {
@@ -50,6 +53,7 @@ import {
   setSelectedAgent,
   setSharedAgents,
 } from './preferences/preferenceSlice';
+import { AppDispatch } from './store';
 import Upload from './upload/Upload';
 
 interface NavigationProps {
@@ -58,7 +62,7 @@ interface NavigationProps {
 }
 
 export default function Navigation({ navOpen, setNavOpen }: NavigationProps) {
-  const dispatch = useDispatch();
+  const dispatch = useDispatch<AppDispatch>();
   const navigate = useNavigate();
 
   const { t } = useTranslation();
@@ -80,6 +84,7 @@ export default function Navigation({ navOpen, setNavOpen }: NavigationProps) {
   const [uploadModalState, setUploadModalState] =
     useState<ActiveState>('INACTIVE');
   const [recentAgents, setRecentAgents] = useState<Agent[]>([]);
+  const [searchOpen, setSearchOpen] = useState(false);
 
   const navRef = useRef<HTMLDivElement>(null);
   useEffect(() => {
@@ -182,7 +187,7 @@ export default function Navigation({ navOpen, setNavOpen }: NavigationProps) {
     resetConversation();
     dispatch(setSelectedAgent(agent));
     if (isMobile || isTablet) setNavOpen(!navOpen);
-    navigate('/');
+    navigate(agent.id ? `/agents/${agent.id}/c/new` : '/c/new');
   };
 
   const handleTogglePin = (agent: Agent) => {
@@ -200,20 +205,21 @@ export default function Navigation({ navOpen, setNavOpen }: NavigationProps) {
     try {
       dispatch(setSelectedAgent(null));
 
-      const response = await conversationService.getConversation(index, token);
-      if (!response.ok) {
-        navigate('/');
+      // Pre-fetch to choose the route shape (owned-agent / shared / none).
+      const result = await dispatch(
+        loadConversation({ id: index, force: true }),
+      ).unwrap();
+      // Stale: a newer load has already updated Redux; the URL is
+      // wherever that newer flow lands, leave it alone.
+      if (result.stale) return;
+      const data = result.data;
+      if (!data) {
+        navigate('/c/new');
         return;
       }
 
-      const data = await response.json();
-      if (!data) return;
-
       dispatch(setConversation(data.queries));
       dispatch(updateConversationId({ query: { conversationId: index } }));
 
       if (!data.agent_id) {
-        navigate('/');
+        navigate(`/c/${index}`);
         return;
       }
 
@@ -224,7 +230,7 @@ export default function Navigation({ navOpen, setNavOpen }: NavigationProps) {
           token,
         );
         if (!sharedResponse.ok) {
-          navigate('/');
+          navigate(`/c/${index}`);
          return;
        }
        agent = await sharedResponse.json();
@@ -232,7 +238,7 @@ export default function Navigation({ navOpen, setNavOpen }: NavigationProps) {
       } else {
         const agentResponse = await userService.getAgent(data.agent_id, token);
         if (!agentResponse.ok) {
-          navigate('/');
+          navigate(`/c/${index}`);
          return;
        }
        agent = await agentResponse.json();
@@ -240,12 +246,12 @@ export default function Navigation({ navOpen, setNavOpen }: NavigationProps) {
         navigate(`/agents/shared/${agent.shared_token}`);
       } else {
         await Promise.resolve(dispatch(setSelectedAgent(agent)));
-        navigate('/');
+        navigate(`/agents/${data.agent_id}/c/${index}`);
       }
     }
     } catch (error) {
       console.error('Error handling conversation click:', error);
-      navigate('/');
+      navigate('/c/new');
     }
   };
 
@@ -264,6 +270,7 @@ export default function Navigation({ navOpen, setNavOpen }: NavigationProps) {
     if (queries && queries?.length > 0) {
       resetConversation();
     }
+    navigate('/c/new');
   };
 
   async function updateConversationName(updatedConversation: {
@@ -275,7 +282,6 @@ export default function Navigation({ navOpen, setNavOpen }: NavigationProps) {
       .then((response) => response.json())
       .then((data) => {
         if (data) {
-          navigate('/');
           fetchConversations();
         }
       })
@@ -370,7 +376,7 @@ export default function Navigation({ navOpen, setNavOpen }: NavigationProps) {
         </button>
       </div>
       <NavLink
-        to={'/'}
+        to={'/c/new'}
        onClick={() => {
          if (isMobile || isTablet) {
            setNavOpen(!navOpen);
@@ -503,11 +509,23 @@ export default function Navigation({ navOpen, setNavOpen }: NavigationProps) {
       )}
       {conversations?.data && conversations.data.length > 0 ? (
         <div className="mt-7">
-          <div className="mx-4 my-auto mt-2 flex h-6 items-center justify-between gap-4 rounded-3xl">
+          <div className="mx-4 my-auto mt-2 flex h-8 items-center justify-between gap-4 rounded-3xl">
             <p className="mt-1 ml-4 text-sm font-semibold">{t('chats')}</p>
+            <button
+              onClick={() => setSearchOpen(true)}
+              className="hover:bg-sidebar-accent mr-2 flex h-7 w-7 items-center justify-center rounded-full"
+              aria-label={t('modals.searchConversations.searchPlaceholder')}
+              title={t('modals.searchConversations.searchPlaceholder')}
+            >
+              <img
+                src={SearchIcon}
+                alt="search"
+                className="h-4 w-4 opacity-70"
+              />
+            </button>
           </div>
           <div className="conversations-container">
-            {conversations.data?.map((conversation) => (
+            {(conversations.data ?? []).map((conversation) => (
              <ConversationTile
                key={conversation.id}
                conversation={conversation}
@@ -641,6 +659,17 @@ export default function Navigation({ navOpen, setNavOpen }: NavigationProps) {
         modalState={showTokenModal ? 'ACTIVE' : 'INACTIVE'}
         handleTokenSubmit={handleTokenSubmit}
       />
+      {searchOpen && (
+        <SearchConversationsModal
+          close={() => setSearchOpen(false)}
+          conversations={conversations?.data ?? []}
+          token={token}
+          onSelectConversation={(id) => {
+            handleConversationClick(id);
+            if (isMobile || isTablet) setNavOpen(false);
+          }}
+        />
+      )}
     </>
   );
 }

@@ -174,7 +174,7 @@ export default function AgentCard({
     if (section === 'user') {
       if (agent.status === 'published') {
         dispatch(setSelectedAgent(agent));
-        navigate(`/`);
+        navigate(agent.id ? `/agents/${agent.id}/c/new` : '/c/new');
       }
     }
     if (section === 'shared') {
@@ -565,8 +565,22 @@ export default function NewAgent({ mode }: { mode: 'new' | 'edit' | 'draft' }) {
         setJsonSchemaText(jsonText);
         setJsonSchemaValid(true);
       }
-      setAgent(data);
-      initialAgentRef.current = data;
+      // Backfill required fields so older agents (created before
+      // agent_type / prompt_id / models existed) don't fail
+      // ``isPublishable()`` and leave Save permanently disabled.
+      const normalized = {
+        ...data,
+        agent_type: data.agent_type || 'classic',
+        prompt_id: data.prompt_id || 'default',
+        retriever: data.retriever || 'classic',
+        chunks: data.chunks || '2',
+        tools: data.tools || [],
+        sources: data.sources || [],
+        models: data.models || [],
+        default_model_id: data.default_model_id || '',
+      };
+      setAgent(normalized);
+      initialAgentRef.current = normalized;
     };
     getAgent();
   }
@@ -1,8 +1,18 @@
 import { useTranslation } from 'react-i18next';
 
+import EditIcon from '../assets/edit.svg';
 import AgentImage from '../components/AgentImage';
 import { getToolDisplayName } from '../utils/toolUtils';
 import { Agent } from './types';
 
-export default function SharedAgentCard({ agent }: { agent: Agent }) {
+export default function SharedAgentCard({
+  agent,
+  onEdit,
+}: {
+  agent: Agent;
+  onEdit?: () => void;
+}) {
   const { t } = useTranslation();
   // Check if shared metadata exists and has properties (type is 'any' so we validate it's a non-empty object)
   const hasSharedMetadata =
     agent.shared_metadata &&
@@ -11,14 +21,14 @@ export default function SharedAgentCard({ agent }: { agent: Agent }) {
     Object.keys(agent.shared_metadata).length > 0;
   return (
     <div className="border-border dark:border-border flex w-full max-w-[720px] flex-col rounded-3xl border p-6 shadow-xs sm:w-fit sm:min-w-[480px]">
-      <div className="flex items-center gap-3">
+      <div className="flex items-start gap-3">
         <div className="flex h-12 w-12 items-center justify-center overflow-hidden rounded-full p-1">
           <AgentImage
             src={agent.image}
            className="h-full w-full rounded-full object-contain"
          />
        </div>
-        <div className="flex max-h-[92px] w-[80%] flex-col gap-px">
+        <div className="flex max-h-[92px] flex-1 flex-col gap-px">
          <h2 className="text-foreground text-base font-semibold sm:text-lg">
            {agent.name}
          </h2>
@@ -26,6 +36,17 @@ export default function SharedAgentCard({ agent }: { agent: Agent }) {
           {agent.description}
         </p>
       </div>
+        {onEdit && (
+          <button
+            type="button"
+            onClick={onEdit}
+            className="border-border hover:bg-accent text-foreground flex shrink-0 items-center gap-1.5 rounded-full border px-3 py-1.5 text-sm font-medium transition-colors"
+            aria-label={t('agents.edit')}
+          >
+            <img src={EditIcon} alt="" className="h-3.5 w-3.5" />
+            {t('agents.edit')}
+          </button>
+        )}
       </div>
       {hasSharedMetadata && (
         <div className="mt-4 flex items-center gap-8">
@@ -813,7 +813,11 @@ function WorkflowBuilderInner() {
       const response = await userService.getWorkflow(workflowId, token);
       if (!response.ok) throw new Error('Failed to fetch workflow');
       const responseData = await response.json();
-      const { workflow, nodes: apiNodes, edges: apiEdges } = responseData.data;
+      const {
+        workflow,
+        nodes: apiNodes,
+        edges: apiEdges,
+      } = responseData.data;
       const nextWorkflowName = workflow.name;
       const nextWorkflowDescription = workflow.description || '';
       const mappedNodes = apiNodes.map((n: WorkflowNode) => {
@@ -1472,7 +1476,9 @@ function WorkflowBuilderInner() {
               {t('agents.form.advanced.systemPromptOverride')}
             </label>
             <p className="mt-0.5 text-[11px] text-gray-500 dark:text-gray-400">
-              {t('agents.form.advanced.systemPromptOverrideDescription')}
+              {t(
+                'agents.form.advanced.systemPromptOverrideDescription',
+              )}
             </p>
           </div>
           <button
@@ -1,3 +1,5 @@
import { withThrottle, type FetchLike } from './throttle';

export const baseURL =
import.meta.env.VITE_API_HOST || 'https://docsapi.arc53.com';

@@ -18,112 +20,121 @@ const getHeaders = (
return headers;
};

const apiClient = {
get: (
url: string,
token: string | null,
headers = {},
signal?: AbortSignal,
): Promise<any> =>
fetch(`${baseURL}${url}`, {
method: 'GET',
headers: getHeaders(token, headers),
signal,
}).then((response) => {
return response;
}),
const createClient = (transport: FetchLike) => {
const request = (url: string, init: RequestInit): Promise<Response> =>
transport(`${baseURL}${url}`, init);

post: (
url: string,
data: any,
token: string | null,
headers = {},
signal?: AbortSignal,
): Promise<any> =>
fetch(`${baseURL}${url}`, {
method: 'POST',
headers: getHeaders(token, headers),
body: JSON.stringify(data),
signal,
}).then((response) => {
return response;
}),
return {
get: (
url: string,
token: string | null,
headers = {},
signal?: AbortSignal,
): Promise<any> =>
request(url, {
method: 'GET',
headers: getHeaders(token, headers),
signal,
}),

postFormData: (
url: string,
formData: FormData,
token: string | null,
headers = {},
signal?: AbortSignal,
): Promise<Response> => {
return fetch(`${baseURL}${url}`, {
method: 'POST',
headers: getHeaders(token, headers, true),
body: formData,
signal,
});
},
post: (
url: string,
data: any,
token: string | null,
headers = {},
signal?: AbortSignal,
): Promise<any> =>
request(url, {
method: 'POST',
headers: getHeaders(token, headers),
body: JSON.stringify(data),
signal,
}),

put: (
url: string,
data: any,
token: string | null,
headers = {},
signal?: AbortSignal,
): Promise<any> =>
fetch(`${baseURL}${url}`, {
method: 'PUT',
headers: getHeaders(token, headers),
body: JSON.stringify(data),
signal,
}).then((response) => {
return response;
}),
postFormData: (
url: string,
formData: FormData,
token: string | null,
headers = {},
signal?: AbortSignal,
): Promise<Response> =>
request(url, {
method: 'POST',
headers: getHeaders(token, headers, true),
body: formData,
signal,
}),

patch: (
url: string,
data: any,
token: string | null,
headers = {},
signal?: AbortSignal,
): Promise<any> =>
fetch(`${baseURL}${url}`, {
method: 'PATCH',
headers: getHeaders(token, headers),
body: JSON.stringify(data),
signal,
}).then((response) => {
return response;
}),
put: (
url: string,
data: any,
token: string | null,
headers = {},
signal?: AbortSignal,
): Promise<any> =>
request(url, {
method: 'PUT',
headers: getHeaders(token, headers),
body: JSON.stringify(data),
signal,
}),

putFormData: (
url: string,
formData: FormData,
token: string | null,
headers = {},
signal?: AbortSignal,
): Promise<Response> => {
return fetch(`${baseURL}${url}`, {
method: 'PUT',
headers: getHeaders(token, headers, true),
body: formData,
signal,
});
},
patch: (
url: string,
data: any,
token: string | null,
headers = {},
signal?: AbortSignal,
): Promise<any> =>
request(url, {
method: 'PATCH',
headers: getHeaders(token, headers),
body: JSON.stringify(data),
signal,
}),

delete: (
url: string,
token: string | null,
headers = {},
signal?: AbortSignal,
): Promise<any> =>
fetch(`${baseURL}${url}`, {
method: 'DELETE',
headers: getHeaders(token, headers),
signal,
}).then((response) => {
return response;
}),
putFormData: (
url: string,
formData: FormData,
token: string | null,
headers = {},
signal?: AbortSignal,
): Promise<Response> =>
request(url, {
method: 'PUT',
headers: getHeaders(token, headers, true),
body: formData,
signal,
}),

delete: (
url: string,
token: string | null,
headers = {},
signal?: AbortSignal,
): Promise<any> =>
request(url, {
method: 'DELETE',
headers: getHeaders(token, headers),
signal,
}),
};
};

const apiClient = createClient((url, init) => fetch(url, init));

// Throttled client for endpoints that fan out, are polled, or are commonly
// requested concurrently from multiple components. Shares a single concurrency
// budget and de-duplicates identical in-flight GETs.
export const throttledApiClient = createClient(
withThrottle((url, init) => fetch(url, init), { debugLabel: 'api' }),
);

if (import.meta.env.DEV && typeof window !== 'undefined') {
(window as unknown as Record<string, unknown>).__apiClient = apiClient;
(window as unknown as Record<string, unknown>).__throttledApiClient =
throttledApiClient;
(window as unknown as Record<string, unknown>).__baseURL = baseURL;
}

export default apiClient;

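For context, the payoff of this refactor is that the HTTP verbs are written once against an abstract FetchLike transport, so cross-cutting concerns compose at the transport layer. A minimal sketch of what that enables, reusing the same FetchLike type (the withLogging middleware below is hypothetical and not part of this changeset):

import { type FetchLike } from './throttle';

// Hypothetical middleware: logs method, URL, status and latency.
const withLogging =
  (transport: FetchLike): FetchLike =>
  async (url, init) => {
    const started = performance.now();
    const response = await transport(url, init);
    const ms = Math.round(performance.now() - started);
    console.debug(`${init?.method ?? 'GET'} ${url} -> ${response.status} (${ms}ms)`);
    return response;
  };

// Inside client.ts this composes directly, e.g.:
//   const apiClient = createClient(withLogging((url, init) => fetch(url, init)));
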
@@ -43,6 +43,11 @@ const endpoints = {
DELETE_TOOL: '/api/delete_tool',
PARSE_SPEC: '/api/parse_spec',
SYNC_CONNECTOR: '/api/connectors/sync',
CONNECTOR_AUTH: (provider: string) =>
`/api/connectors/auth?provider=${provider}`,
CONNECTOR_FILES: '/api/connectors/files',
CONNECTOR_VALIDATE_SESSION: '/api/connectors/validate-session',
CONNECTOR_DISCONNECT: '/api/connectors/disconnect',
GET_CHUNKS: (
docId: string,
page: number,
@@ -59,6 +64,7 @@ const endpoints = {
UPDATE_CHUNK: '/api/update_chunk',
STORE_ATTACHMENT: '/api/store_attachment',
STT: '/api/stt',
TTS: '/api/tts',
LIVE_STT_START: '/api/stt/live/start',
LIVE_STT_CHUNK: '/api/stt/live/chunk',
LIVE_STT_FINISH: '/api/stt/live/finish',
@@ -92,6 +98,9 @@ const endpoints = {
FEEDBACK: '/api/feedback',
CONVERSATION: (id: string) => `/api/get_single_conversation?id=${id}`,
CONVERSATIONS: '/api/get_conversations',
SEARCH_CONVERSATIONS: (q: string, limit = 30) =>
`/api/search_conversations?q=${encodeURIComponent(q)}&limit=${limit}`,
MESSAGE_TAIL: (messageId: string) => `/api/messages/${messageId}/tail`,
SHARE_CONVERSATION: (isPromptable: boolean) =>
`/api/share?isPromptable=${isPromptable}`,
SHARED_CONVERSATION: (identifier: string) =>

@@ -6,18 +6,20 @@ const conversationService = {
data: any,
token: string | null,
signal: AbortSignal,
headers: Record<string, string> = {},
): Promise<any> =>
apiClient.post(endpoints.CONVERSATION.ANSWER, data, token, {}, signal),
apiClient.post(endpoints.CONVERSATION.ANSWER, data, token, headers, signal),
answerStream: (
data: any,
token: string | null,
signal: AbortSignal,
headers: Record<string, string> = {},
): Promise<any> =>
apiClient.post(
endpoints.CONVERSATION.ANSWER_STREAMING,
data,
token,
{},
headers,
signal,
),
search: (data: any, token: string | null): Promise<any> =>
@@ -26,8 +28,20 @@ const conversationService = {
apiClient.post(endpoints.CONVERSATION.FEEDBACK, data, token, {}),
getConversation: (id: string, token: string | null): Promise<any> =>
apiClient.get(endpoints.CONVERSATION.CONVERSATION(id), token, {}),
tailMessage: (messageId: string, token: string | null): Promise<any> =>
apiClient.get(endpoints.CONVERSATION.MESSAGE_TAIL(messageId), token, {}),
getConversations: (token: string | null): Promise<any> =>
apiClient.get(endpoints.CONVERSATION.CONVERSATIONS, token, {}),
searchConversations: (
query: string,
token: string | null,
limit = 30,
): Promise<any> =>
apiClient.get(
endpoints.CONVERSATION.SEARCH_CONVERSATIONS(query, limit),
token,
{},
),
shareConversation: (
isPromptable: boolean,
data: any,

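The new trailing headers parameter exists so callers can attach per-request metadata without touching the payload. A sketch of the intended call shape (question text, token, and key value are illustrative):

const token: string | null = null; // stands in for the real auth token
const controller = new AbortController();
conversationService.answerStream(
  { question: 'What is DocsGPT?' },
  token,
  controller.signal,
  { 'Idempotency-Key': crypto.randomUUID() },
);
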
@@ -1,11 +1,12 @@
import { getSessionToken } from '../../utils/providerUtils';
import apiClient from '../client';
import apiClient, { throttledApiClient } from '../client';
import endpoints from '../endpoints';

const userService = {
getConfig: (): Promise<any> => apiClient.get(endpoints.USER.CONFIG, null),
getConfig: (): Promise<any> =>
throttledApiClient.get(endpoints.USER.CONFIG, null),
getNewToken: (): Promise<any> =>
apiClient.get(endpoints.USER.NEW_TOKEN, null),
throttledApiClient.get(endpoints.USER.NEW_TOKEN, null),
getDocs: (token: string | null): Promise<any> =>
apiClient.get(`${endpoints.USER.DOCS}`, token),
getDocsWithPagination: (query: string, token: string | null): Promise<any> =>
@@ -17,9 +18,9 @@ const userService = {
deleteAPIKey: (data: any, token: string | null): Promise<any> =>
apiClient.post(endpoints.USER.DELETE_API_KEY, data, token),
getAgent: (id: string, token: string | null): Promise<any> =>
apiClient.get(endpoints.USER.AGENT(id), token),
throttledApiClient.get(endpoints.USER.AGENT(id), token),
getAgents: (token: string | null): Promise<any> =>
apiClient.get(endpoints.USER.AGENTS, token),
throttledApiClient.get(endpoints.USER.AGENTS, token),
createAgent: (data: any, token: string | null): Promise<any> =>
apiClient.postFormData(endpoints.USER.CREATE_AGENT, data, token),
updateAgent: (
@@ -31,19 +32,19 @@ const userService = {
deleteAgent: (id: string, token: string | null): Promise<any> =>
apiClient.delete(endpoints.USER.DELETE_AGENT(id), token),
getPinnedAgents: (token: string | null): Promise<any> =>
apiClient.get(endpoints.USER.PINNED_AGENTS, token),
throttledApiClient.get(endpoints.USER.PINNED_AGENTS, token),
togglePinAgent: (id: string, token: string | null): Promise<any> =>
apiClient.post(endpoints.USER.TOGGLE_PIN_AGENT(id), {}, token),
getSharedAgent: (id: string, token: string | null): Promise<any> =>
apiClient.get(endpoints.USER.SHARED_AGENT(id), token),
getSharedAgents: (token: string | null): Promise<any> =>
apiClient.get(endpoints.USER.SHARED_AGENTS, token),
throttledApiClient.get(endpoints.USER.SHARED_AGENTS, token),
shareAgent: (data: any, token: string | null): Promise<any> =>
apiClient.put(endpoints.USER.SHARE_AGENT, data, token),
removeSharedAgent: (id: string, token: string | null): Promise<any> =>
apiClient.delete(endpoints.USER.REMOVE_SHARED_AGENT(id), token),
getTemplateAgents: (token: string | null): Promise<any> =>
apiClient.get(endpoints.USER.TEMPLATE_AGENTS, token),
throttledApiClient.get(endpoints.USER.TEMPLATE_AGENTS, token),
adoptAgent: (id: string, token: string | null): Promise<any> =>
apiClient.post(endpoints.USER.ADOPT_AGENT(id), {}, token),
getAgentWebhook: (id: string, token: string | null): Promise<any> =>
@@ -61,7 +62,7 @@ const userService = {
deletePath: (docPath: string, token: string | null): Promise<any> =>
apiClient.get(endpoints.USER.DELETE_PATH(docPath), token),
getTaskStatus: (task_id: string, token: string | null): Promise<any> =>
apiClient.get(endpoints.USER.TASK_STATUS(task_id), token),
throttledApiClient.get(endpoints.USER.TASK_STATUS(task_id), token),
getMessageAnalytics: (data: any, token: string | null): Promise<any> =>
apiClient.post(endpoints.USER.MESSAGE_ANALYTICS, data, token),
getTokenAnalytics: (data: any, token: string | null): Promise<any> =>
@@ -149,7 +150,7 @@ const userService = {
path?: string,
search?: string,
): Promise<any> =>
apiClient.get(
throttledApiClient.get(
endpoints.USER.GET_CHUNKS(docId, page, perPage, path, search),
token,
),
@@ -164,7 +165,7 @@ const userService = {
updateChunk: (data: any, token: string | null): Promise<any> =>
apiClient.put(endpoints.USER.UPDATE_CHUNK, data, token),
getDirectoryStructure: (docId: string, token: string | null): Promise<any> =>
apiClient.get(endpoints.USER.DIRECTORY_STRUCTURE(docId), token),
throttledApiClient.get(endpoints.USER.DIRECTORY_STRUCTURE(docId), token),
manageSourceFiles: (data: FormData, token: string | null): Promise<any> =>
apiClient.postFormData(endpoints.USER.MANAGE_SOURCE_FILES, data, token),
testMCPConnection: (data: any, token: string | null): Promise<any> =>
@@ -172,9 +173,9 @@ const userService = {
saveMCPServer: (data: any, token: string | null): Promise<any> =>
apiClient.post(endpoints.USER.MCP_SAVE_SERVER, data, token),
getMCPOAuthStatus: (task_id: string, token: string | null): Promise<any> =>
apiClient.get(endpoints.USER.MCP_OAUTH_STATUS(task_id), token),
throttledApiClient.get(endpoints.USER.MCP_OAUTH_STATUS(task_id), token),
getMCPAuthStatus: (token: string | null): Promise<any> =>
apiClient.get(endpoints.USER.MCP_AUTH_STATUS, token),
throttledApiClient.get(endpoints.USER.MCP_AUTH_STATUS, token),
syncConnector: (
docId: string,
provider: string,
@@ -191,8 +192,50 @@ const userService = {
token,
);
},
getConnectorAuthUrl: (provider: string, token: string | null): Promise<any> =>
apiClient.get(endpoints.USER.CONNECTOR_AUTH(provider), token),
getConnectorFiles: (
data: any,
token: string | null,
signal?: AbortSignal,
): Promise<any> =>
throttledApiClient.post(
endpoints.USER.CONNECTOR_FILES,
data,
token,
{},
signal,
),
validateConnectorSession: (
provider: string,
token: string | null,
): Promise<any> =>
apiClient.post(
endpoints.USER.CONNECTOR_VALIDATE_SESSION,
{
provider,
session_token: getSessionToken(provider),
},
token,
),
disconnectConnector: (
provider: string,
sessionToken: string,
token: string | null,
): Promise<any> =>
apiClient.post(
endpoints.USER.CONNECTOR_DISCONNECT,
{ provider, session_token: sessionToken },
token,
),
textToSpeech: (
text: string,
token: string | null,
signal?: AbortSignal,
): Promise<any> =>
apiClient.post(endpoints.USER.TTS, { text }, token, {}, signal),
getAgentFolders: (token: string | null): Promise<any> =>
apiClient.get(endpoints.USER.AGENT_FOLDERS, token),
throttledApiClient.get(endpoints.USER.AGENT_FOLDERS, token),
createAgentFolder: (
data: { name: string; parent_id?: string },
token: string | null,

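The split applied above follows one rule: reads that are polled, fan out, or are fetched by several components at once (config, agents, task status, MCP status, chunks, connector files) move to throttledApiClient; one-shot mutations stay on the plain apiClient so writes are never queued behind bursts of reads. Illustratively (ids and payload shape are placeholders):

async function example(taskId: string, token: string | null) {
  const status = await userService.getTaskStatus(taskId, token); // throttled, dedupable GET
  await userService.updateChunk({ id: 'chunk-1', text: '...' }, token); // direct PUT, never queued
  return status;
}
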
frontend/src/api/throttle.ts (new file, 223 lines)
@@ -0,0 +1,223 @@
/**
 * Transport-layer middleware factory for the frontend API layer.
 */

export type FetchLike = (
input: string,
init?: RequestInit,
) => Promise<Response>;

export interface ThrottleConfig {
maxConcurrentGlobal?: number;
maxConcurrentPerRoute?: number;
dedupe?: boolean;
dedupeKey?: (url: string, init?: RequestInit) => string | false;
debugLabel?: string;
}

const DEFAULT_MAX_CONCURRENT_GLOBAL = 8;
const DEFAULT_MAX_CONCURRENT_PER_ROUTE = 3;

type QueueItem = {
run: () => void;
signal?: AbortSignal;
onAbort: () => void;
};

function routeKey(method: string, url: string): string {
let pathname = url;
try {
pathname = new URL(url, 'http://_').pathname;
} catch {
pathname = url.split('?')[0];
}
return `${method.toUpperCase()} ${pathname}`;
}

function abortError(): DOMException {
return new DOMException('The operation was aborted.', 'AbortError');
}

interface ThrottleState {
perRouteQueues: Map<string, QueueItem[]>;
inflightPerRoute: Map<string, number>;
inflightGets: Map<string, Promise<Response>>;
inflightGlobal: number;
}

function createState(): ThrottleState {
return {
perRouteQueues: new Map(),
inflightPerRoute: new Map(),
inflightGets: new Map(),
inflightGlobal: 0,
};
}

export function withThrottle(
fetchLike: FetchLike,
config: ThrottleConfig = {},
): FetchLike & { __reset: () => void } {
const maxGlobal = config.maxConcurrentGlobal ?? DEFAULT_MAX_CONCURRENT_GLOBAL;
const maxPerRoute =
config.maxConcurrentPerRoute ?? DEFAULT_MAX_CONCURRENT_PER_ROUTE;
const dedupeEnabled = config.dedupe !== false;
const state = createState();

// Toggle in DevTools with: localStorage.setItem('debug:throttle', '1')
const isDebug = (): boolean => {
try {
return (
typeof localStorage !== 'undefined' &&
localStorage.getItem('debug:throttle') === '1'
);
} catch {
return false;
}
};

const log = (
event: string,
key: string,
extra?: Record<string, unknown>,
): void => {
if (!isDebug()) return;
const queued = state.perRouteQueues.get(key)?.length ?? 0;
const perRoute = state.inflightPerRoute.get(key) ?? 0;
const tag = config.debugLabel
? `[throttle:${config.debugLabel}]`
: '[throttle]';
console.debug(
`${tag} ${event} ${key} | inflight=${state.inflightGlobal}/${maxGlobal} route=${perRoute}/${maxPerRoute} queued=${queued}`,
extra ?? '',
);
};

const canDispatch = (key: string): boolean => {
const perRoute = state.inflightPerRoute.get(key) ?? 0;
return state.inflightGlobal < maxGlobal && perRoute < maxPerRoute;
};

const pumpQueues = (): void => {
for (const [key, queue] of state.perRouteQueues) {
while (queue.length > 0 && canDispatch(key)) {
const item = queue.shift()!;
item.signal?.removeEventListener('abort', item.onAbort);
item.run();
}
if (queue.length === 0) state.perRouteQueues.delete(key);
}
};

const enqueue = (key: string, item: QueueItem): void => {
let queue = state.perRouteQueues.get(key);
if (!queue) {
queue = [];
state.perRouteQueues.set(key, queue);
}
queue.push(item);
};

const acquireSlot = (key: string, signal?: AbortSignal): Promise<void> =>
new Promise((resolve, reject) => {
if (signal?.aborted) {
reject(abortError());
return;
}
const item: QueueItem = {
signal,
run: () => {
state.inflightGlobal += 1;
state.inflightPerRoute.set(
key,
(state.inflightPerRoute.get(key) ?? 0) + 1,
);
resolve();
},
onAbort: () => {
const queue = state.perRouteQueues.get(key);
if (queue) {
const idx = queue.indexOf(item);
if (idx >= 0) queue.splice(idx, 1);
}
log('abort-queued', key);
reject(abortError());
},
};
const queued = state.perRouteQueues.get(key);
if ((!queued || queued.length === 0) && canDispatch(key)) {
item.run();
log('dispatch', key);
return;
}
signal?.addEventListener('abort', item.onAbort, { once: true });
enqueue(key, item);
log('queued', key);
});

const releaseSlot = (key: string): void => {
state.inflightGlobal = Math.max(0, state.inflightGlobal - 1);
const next = (state.inflightPerRoute.get(key) ?? 1) - 1;
if (next <= 0) state.inflightPerRoute.delete(key);
else state.inflightPerRoute.set(key, next);
log('release', key);
pumpQueues();
};

const wrapped = (async (url, init = {}) => {
const method = (init.method ?? 'GET').toUpperCase();
const signal = init.signal ?? undefined;
const key = routeKey(method, url);

// Dedupe is restricted to GETs without a caller-supplied AbortSignal:
// sharing a single underlying fetch across waiters means an abort by one
// caller would reject the others, which is not the contract callers expect.
const customKey = config.dedupeKey?.(url, init);
const dedupeAllowed =
dedupeEnabled &&
customKey !== false &&
method === 'GET' &&
!init.body &&
!signal;
const dedupeKey = typeof customKey === 'string' ? customKey : `GET ${url}`;

if (dedupeAllowed) {
const existing = state.inflightGets.get(dedupeKey);
if (existing) {
log('dedupe-hit', key, { dedupeKey });
return existing.then((r) => r.clone());
}
}

const run = async (): Promise<Response> => {
await acquireSlot(key, signal);
try {
return await fetchLike(url, init);
} finally {
releaseSlot(key);
}
};

if (dedupeAllowed) {
const promise = run();
state.inflightGets.set(dedupeKey, promise);
promise.finally(() => {
if (state.inflightGets.get(dedupeKey) === promise) {
state.inflightGets.delete(dedupeKey);
}
});
return promise.then((r) => r.clone());
}

return run();
}) as FetchLike & { __reset: () => void };

wrapped.__reset = () => {
state.perRouteQueues.clear();
state.inflightPerRoute.clear();
state.inflightGets.clear();
state.inflightGlobal = 0;
};

return wrapped;
}

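Taken on its own, withThrottle is usable outside the client factory. A small usage sketch (the endpoint path is illustrative):

import { withThrottle } from './throttle';

const limitedFetch = withThrottle((url, init) => fetch(url, init), {
  maxConcurrentGlobal: 4,
  maxConcurrentPerRoute: 2,
  debugLabel: 'demo',
});

async function demo() {
  // Ten identical GETs with no AbortSignal: deduped into a single
  // network request; every caller receives its own clone of the Response.
  const responses = await Promise.all(
    Array.from({ length: 10 }, () => limitedFetch('/api/get_conversations')),
  );
  return responses.length; // 10 Response objects, 1 underlying fetch
}

The clone() on every dedupe path is what makes sharing safe: a Response body is a one-shot stream, so each waiter must read its own copy rather than the shared original.
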
@@ -40,7 +40,7 @@ export default function ActionButtons({
query: { conversationId: null },
}),
);
navigate('/');
navigate('/c/new');
};
return (
<div

@@ -11,6 +11,7 @@ import NoFilesIcon from '../assets/no-files.svg';
import SearchIcon from '../assets/search.svg';
import {
useDarkTheme,
useDebouncedValue,
useLoaderState,
useMediaQuery,
useOutsideAlerter,
@@ -130,6 +131,7 @@ const Chunks: React.FC<ChunksProps> = ({
const [totalChunks, setTotalChunks] = useState(0);
const [loading, setLoading] = useLoaderState(true);
const [searchTerm, setSearchTerm] = useState<string>('');
const debouncedSearchTerm = useDebouncedValue(searchTerm, 300);
const [editingChunk, setEditingChunk] = useState<ChunkType | null>(null);
const [editingTitle, setEditingTitle] = useState('');
const [editingText, setEditingText] = useState('');
@@ -151,7 +153,7 @@ const Chunks: React.FC<ChunksProps> = ({
perPage,
token,
path,
searchTerm,
debouncedSearchTerm,
);

if (!response.ok) {
@@ -276,16 +278,12 @@ const Chunks: React.FC<ChunksProps> = ({
};

useEffect(() => {
const delayDebounceFn = setTimeout(() => {
if (page !== 1) {
setPage(1);
} else {
fetchChunks();
}
}, 300);

return () => clearTimeout(delayDebounceFn);
}, [searchTerm]);
if (page !== 1) {
setPage(1);
} else {
fetchChunks();
}
}, [debouncedSearchTerm]);

useEffect(() => {
!loading && fetchChunks();

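The diff above swaps a hand-rolled setTimeout debounce for a shared useDebouncedValue hook. The hook's implementation is not part of this changeset; a typical shape, for reference, is:

import { useEffect, useState } from 'react';

// Returns `value` after it has been stable for `delayMs` milliseconds.
export function useDebouncedValue<T>(value: T, delayMs: number): T {
  const [debounced, setDebounced] = useState(value);
  useEffect(() => {
    const id = setTimeout(() => setDebounced(value), delayMs);
    return () => clearTimeout(id); // restart the timer on every change
  }, [value, delayMs]);
  return debounced;
}
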
@@ -2,6 +2,7 @@ import React, { useRef } from 'react';
import { useTranslation } from 'react-i18next';
import { useSelector } from 'react-redux';

import userService from '../api/services/userService';
import { useDarkTheme } from '../hooks';
import { selectToken } from '../preferences/preferenceSlice';

@@ -68,12 +69,9 @@ const ConnectorAuth: React.FC<ConnectorAuthProps> = ({
completedRef.current = false;
cleanup();

const apiHost = import.meta.env.VITE_API_HOST;
const authResponse = await fetch(
`${apiHost}/api/connectors/auth?provider=${provider}`,
{
headers: { Authorization: `Bearer ${token}` },
},
const authResponse = await userService.getConnectorAuthUrl(
provider,
token,
);

if (!authResponse.ok) {

@@ -1,5 +1,6 @@
import React, { useState, useEffect, useCallback, useRef } from 'react';
import { useTranslation } from 'react-i18next';
import userService from '../api/services/userService';
import { formatBytes } from '../utils/stringUtils';
import { formatDate } from '../utils/dateTimeUtils';
import {
@@ -22,6 +23,7 @@ import {
TableHeader,
TableCell,
} from './Table';
import { useDebouncedCallback } from '../hooks';

interface CloudFile {
id: string;
@@ -100,7 +102,6 @@ export const FilePicker: React.FC<CloudFilePickerProps> = ({
const [activeTab, setActiveTab] = useState<'my_files' | 'shared'>('my_files');

const scrollContainerRef = useRef<HTMLDivElement>(null);
const searchTimeoutRef = useRef<ReturnType<typeof setTimeout> | null>(null);
const abortControllerRef = useRef<AbortController | null>(null);

const isFolder = (file: CloudFile) => {
@@ -126,7 +127,6 @@ export const FilePicker: React.FC<CloudFilePickerProps> = ({

setIsLoading(true);

const apiHost = import.meta.env.VITE_API_HOST;
if (!pageToken) {
setFiles([]);
}
@@ -141,15 +141,11 @@ export const FilePicker: React.FC<CloudFilePickerProps> = ({
search_query: searchQuery,
shared: shared,
};
const response = await fetch(`${apiHost}/api/connectors/files`, {
method: 'POST',
headers: {
'Content-Type': 'application/json',
Authorization: `Bearer ${token}`,
},
body: JSON.stringify(body),
signal: controller.signal,
});
const response = await userService.getConnectorFiles(
body,
token,
controller.signal,
);

const data = await response.json();
if (data.success) {
@@ -187,20 +183,9 @@ export const FilePicker: React.FC<CloudFilePickerProps> = ({
}

try {
const apiHost = import.meta.env.VITE_API_HOST;
const validateResponse = await fetch(
`${apiHost}/api/connectors/validate-session`,
{
method: 'POST',
headers: {
'Content-Type': 'application/json',
Authorization: `Bearer ${token}`,
},
body: JSON.stringify({
provider: provider,
session_token: sessionToken,
}),
},
const validateResponse = await userService.validateConnectorSession(
provider,
token,
);

if (!validateResponse.ok) {
@@ -292,32 +277,26 @@ export const FilePicker: React.FC<CloudFilePickerProps> = ({

useEffect(() => {
return () => {
if (searchTimeoutRef.current) {
clearTimeout(searchTimeoutRef.current);
}
abortControllerRef.current?.abort();
};
}, []);

const debouncedLoadFiles = useDebouncedCallback((query: string) => {
const sessionToken = getSessionToken(provider);
if (sessionToken) {
loadCloudFiles(
sessionToken,
currentFolderId,
undefined,
query,
activeTab === 'shared' && !currentFolderId,
);
}
}, 300);

const handleSearchChange = (query: string) => {
setSearchQuery(query);

if (searchTimeoutRef.current) {
clearTimeout(searchTimeoutRef.current);
}

searchTimeoutRef.current = setTimeout(() => {
const sessionToken = getSessionToken(provider);
if (sessionToken) {
loadCloudFiles(
sessionToken,
currentFolderId,
undefined,
query,
activeTab === 'shared' && !currentFolderId,
);
}
}, 300);
debouncedLoadFiles(query);
};

const handleFolderClick = (folderId: string, folderName: string) => {
@@ -424,23 +403,14 @@ export const FilePicker: React.FC<CloudFilePickerProps> = ({
onDisconnect={() => {
const sessionToken = getSessionToken(provider);
if (sessionToken) {
const apiHost = import.meta.env.VITE_API_HOST;
fetch(`${apiHost}/api/connectors/disconnect`, {
method: 'POST',
headers: {
'Content-Type': 'application/json',
Authorization: `Bearer ${token}`,
},
body: JSON.stringify({
provider: provider,
session_token: sessionToken,
}),
}).catch((err) =>
console.error(
`Error disconnecting from ${getProviderConfig(provider).displayName}:`,
err,
),
);
userService
.disconnectConnector(provider, sessionToken, token)
.catch((err) =>
console.error(
`Error disconnecting from ${getProviderConfig(provider).displayName}:`,
err,
),
);
}

removeSessionToken(provider);

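Here the counterpart hook, useDebouncedCallback, replaces the manual searchTimeoutRef plumbing. Again its implementation is not in this diff; a minimal sketch of the expected behavior:

import { useEffect, useMemo, useRef } from 'react';

// Returns a stable function that runs `callback` only after `delayMs`
// of inactivity; each call resets the timer.
export function useDebouncedCallback<A extends unknown[]>(
  callback: (...args: A) => void,
  delayMs: number,
): (...args: A) => void {
  const callbackRef = useRef(callback);
  callbackRef.current = callback; // always call the latest closure
  const timerRef = useRef<ReturnType<typeof setTimeout> | null>(null);
  useEffect(
    () => () => {
      if (timerRef.current) clearTimeout(timerRef.current); // clear on unmount
    },
    [],
  );
  return useMemo(
    () =>
      (...args: A) => {
        if (timerRef.current) clearTimeout(timerRef.current);
        timerRef.current = setTimeout(() => callbackRef.current(...args), delayMs);
      },
    [delayMs],
  );
}
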
@@ -2,6 +2,7 @@ import React, { useState, useEffect } from 'react';
import { useTranslation } from 'react-i18next';
import useDrivePicker from 'react-google-drive-picker';

import userService from '../api/services/userService';
import ConnectorAuth from './ConnectorAuth';
import {
getSessionToken,
@@ -199,18 +200,11 @@ const GoogleDrivePicker: React.FC<GoogleDrivePickerProps> = ({
const sessionToken = getSessionToken('google_drive');
if (sessionToken) {
try {
const apiHost = import.meta.env.VITE_API_HOST;
await fetch(`${apiHost}/api/connectors/disconnect`, {
method: 'POST',
headers: {
'Content-Type': 'application/json',
Authorization: `Bearer ${token}`,
},
body: JSON.stringify({
provider: 'google_drive',
session_token: sessionToken,
}),
});
await userService.disconnectConnector(
'google_drive',
sessionToken,
token,
);
} catch (err) {
console.error('Error disconnecting from Google Drive:', err);
}

@@ -2,8 +2,7 @@ import { useState, useRef, useEffect } from 'react';
import Speaker from '../assets/speaker.svg?react';
import Stopspeech from '../assets/stopspeech.svg?react';
import LoadingIcon from '../assets/Loading.svg?react'; // Add a loading icon SVG here

const apiHost = import.meta.env.VITE_API_HOST || 'https://docsapi.arc53.com';
import userService from '../api/services/userService';

let currentlyPlayingAudio: {
audio: HTMLAudioElement;
@@ -114,12 +113,11 @@ export default function SpeakButton({ text }: { text: string }) {
},
};

const response = await fetch(apiHost + '/api/tts', {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ text }),
signal: abortController.signal,
});
const response = await userService.textToSpeech(
text,
null,
abortController.signal,
);

const data = await response.json();
abortControllerRef.current = null;

@@ -1,8 +1,11 @@
import { useCallback, useEffect, useRef, useState } from 'react';
import { useTranslation } from 'react-i18next';
import { useDispatch, useSelector } from 'react-redux';
import { useNavigate, useParams } from 'react-router-dom';

import userService from '../api/services/userService';
import SharedAgentCard from '../agents/SharedAgentCard';
import { Agent } from '../agents/types';
import ArtifactSidebar from '../components/ArtifactSidebar';
import MessageInput from '../components/MessageInput';
import { useMediaQuery } from '../hooks';
@@ -10,6 +13,7 @@ import {
selectConversationId,
selectSelectedAgent,
selectToken,
setSelectedAgent,
} from '../preferences/preferenceSlice';
import { AppDispatch } from '../store';
import { handleSendFeedback } from './conversationHandlers';
@@ -19,7 +23,9 @@ import { ToolCallsType } from './types';
import {
addQuery,
fetchAnswer,
loadConversation,
resendQuery,
resetConversation,
selectQueries,
selectStatus,
submitToolActions,
@@ -31,6 +37,16 @@ export default function Conversation() {
const { t } = useTranslation();
const { isMobile } = useMediaQuery();
const dispatch = useDispatch<AppDispatch>();
const navigate = useNavigate();
const params = useParams<{
conversationId?: string;
agentId?: string;
}>();
const urlConversationId = params.conversationId;
const urlAgentId = params.agentId;
// ``new`` is treated as empty-chat intent, not a real id to fetch.
const isNewChatRoute =
urlConversationId === undefined || urlConversationId === 'new';

const token = useSelector(selectToken);
const queries = useSelector(selectQueries);
@@ -42,6 +58,65 @@ export default function Conversation() {
const [lastQueryReturnedErr, setLastQueryReturnedErr] =
useState<boolean>(false);

// URL → state. Thunk short-circuits when Redux already matches.
useEffect(() => {
if (isNewChatRoute) {
// Skip when nothing to reset; avoids wiping the in-flight stream
// during the null → assigned-id replace below.
if (conversationId !== null) {
dispatch(resetConversation());
}
return;
}
if (urlConversationId && urlConversationId !== conversationId) {
dispatch(loadConversation({ id: urlConversationId }))
.unwrap()
.then((result) => {
if (result.stale) return;
if (result.data === null) {
navigate('/c/new', { replace: true });
}
})
.catch(() => navigate('/c/new', { replace: true }));
}
}, [urlConversationId, isNewChatRoute]);

// Agent context follows the URL. ``cancelled`` covers two races:
// the user switches agents before the fetch resolves, or leaves the
// agent route entirely; either way the late dispatch must be dropped.
useEffect(() => {
let cancelled = false;
if (urlAgentId) {
if (selectedAgent?.id !== urlAgentId) {
userService
.getAgent(urlAgentId, token)
.then((response) => (response.ok ? response.json() : null))
.then((agent: Agent | null) => {
if (cancelled) return;
if (agent) dispatch(setSelectedAgent(agent));
})
.catch((err) => {
if (!cancelled) console.error('Failed to load agent:', err);
});
}
} else if (selectedAgent !== null) {
dispatch(setSelectedAgent(null));
}
return () => {
cancelled = true;
};
}, [urlAgentId, token]);

// State → URL. ``replace`` so Back doesn't return to /c/new and
// reset the just-streamed chat.
useEffect(() => {
if (!isNewChatRoute || !conversationId) return;
const target = urlAgentId
? `/agents/${urlAgentId}/c/${conversationId}`
: `/c/${conversationId}`;
navigate(target, { replace: true });
}, [conversationId, isNewChatRoute, urlAgentId]);

const handleToolAction = useCallback(
(callId: string, decision: 'approved' | 'denied', comment?: string) => {
dispatch(
@@ -101,7 +176,13 @@ export default function Conversation() {
.map((a) => ({ id: a.id as string, fileName: a.fileName }));

if (index !== undefined) {
dispatch(resendQuery({ index, prompt: trimmedQuestion }));
dispatch(
resendQuery({
index,
prompt: trimmedQuestion,
keepIdempotencyKey: isRetry,
}),
);
handleFetchAnswer({ question: trimmedQuestion, index });
} else {
if (!isRetry)
@@ -151,17 +232,22 @@ export default function Conversation() {
} else if (question && status !== 'loading') {
if (lastQueryReturnedErr && queries.length > 0) {
const retryIndex = queries.length - 1;
dispatch(
updateQuery({
index: retryIndex,
query: {
prompt: question,
},
}),
);
// Different prompt = new logical action, fresh idempotency key.
const prevPrompt = queries[retryIndex].prompt;
const isSamePrompt = prevPrompt === question;
if (!isSamePrompt) {
dispatch(
updateQuery({
index: retryIndex,
query: {
prompt: question,
},
}),
);
}
handleQuestion({
question,
isRetry: true,
isRetry: isSamePrompt,
index: retryIndex,
});
} else {
@@ -236,7 +322,7 @@ export default function Conversation() {
isSplitArtifactOpen ? 'w-[60%] px-6' : 'w-full'
}`}
>
<div className="relative min-h-0 flex-1 ">
<div className="relative min-h-0 flex-1">
<ConversationMessages
handleQuestion={handleQuestion}
handleQuestionSubmission={handleQuestionSubmission}
@@ -250,7 +336,19 @@ export default function Conversation() {
headerContent={
selectedAgent ? (
<div className="flex w-full items-center justify-center py-4">
<SharedAgentCard agent={selectedAgent} />
<SharedAgentCard
agent={selectedAgent}
onEdit={
selectedAgent.id
? () =>
navigate(
selectedAgent.agent_type === 'workflow'
? `/agents/workflow/edit/${selectedAgent.id}`
: `/agents/edit/${selectedAgent.id}`,
)
: undefined
}
/>
</div>
) : undefined
}

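The three effects above assume a route tree shaped like the paths they navigate to; the router config itself is not part of this changeset. The implied shape, as a sketch only:

// Implied routes (illustrative — the actual config lives elsewhere):
// <Route path="/c/new" element={<Conversation />} />
// <Route path="/c/:conversationId" element={<Conversation />} />
// <Route path="/agents/:agentId" element={<Conversation />} />
// <Route path="/agents/:agentId/c/:conversationId" element={<Conversation />} />
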
@@ -132,6 +132,8 @@ const ConversationBubble = forwardRef<
}, [message]);

const handleEditClick = () => {
if (!editInputBox.trim() || editInputBox.trim() === (message ?? '').trim())
return;
setIsEditClicked(false);
handleUpdatedQuestionSubmission?.(editInputBox, true, questionNumber);
};
@@ -242,8 +244,12 @@ const ConversationBubble = forwardRef<
{t('conversation.edit.cancel')}
</button>
<button
className="bg-primary hover:bg-primary/90 dark:hover:bg-primary/90 rounded-full px-4 py-2 text-sm font-medium text-white transition-colors"
className="bg-primary not-disabled:hover:bg-primary/90 not-disabled:dark:hover:bg-primary/90 disabled:bg-primary/30 rounded-full px-4 py-2 text-sm font-medium text-white transition-colors disabled:cursor-not-allowed"
onClick={handleEditClick}
disabled={
!editInputBox.trim() ||
editInputBox.trim() === (message ?? '').trim()
}
>
{t('conversation.edit.update')}
</button>

@@ -248,32 +248,8 @@ export default function ConversationMessages({
? LAST_BUBBLE_MARGIN
: DEFAULT_BUBBLE_MARGIN;

if (query.thought || query.response || query.tool_calls || query.research) {
const isCurrentlyStreaming =
status === 'loading' && index === queries.length - 1;
return (
<ConversationBubble
className={bubbleMargin}
key={`${index}-ANSWER`}
message={query.response}
type={'ANSWER'}
thought={query.thought}
sources={query.sources}
toolCalls={query.tool_calls}
research={query.research}
onOpenArtifact={onOpenArtifact}
onToolAction={onToolAction}
feedback={query.feedback}
isStreaming={isCurrentlyStreaming}
handleFeedback={
handleFeedback
? (feedback) => handleFeedback(query, feedback, index)
: undefined
}
/>
);
}

// Error first; reconciler-failed rows may carry partial thought/
// tool_calls and would otherwise fall into the answer branch.
if (query.error) {
const retryButton = (
<button
@@ -303,6 +279,38 @@ export default function ConversationMessages({
);
}

// tool_calls.length, not tool_calls — empty arrays are JS-truthy.
const hasContent =
query.thought ||
query.response ||
(query.tool_calls && query.tool_calls.length > 0) ||
query.research;
if (hasContent) {
const isCurrentlyStreaming =
status === 'loading' && index === queries.length - 1;
return (
<ConversationBubble
className={bubbleMargin}
key={`${index}-ANSWER`}
message={query.response}
type={'ANSWER'}
thought={query.thought}
sources={query.sources}
toolCalls={query.tool_calls}
research={query.research}
onOpenArtifact={onOpenArtifact}
onToolAction={onToolAction}
feedback={query.feedback}
isStreaming={isCurrentlyStreaming}
handleFeedback={
handleFeedback
? (feedback) => handleFeedback(query, feedback, index)
: undefined
}
/>
);
}

if (status === 'loading' && isLastMessage) {
return (
<div

@@ -64,7 +64,10 @@ export default function ConversationTile({
}

function handleSaveConversation(changedConversation: ConversationProps) {
if (changedConversation.name.trim().length) {
if (
changedConversation.name.trim().length &&
changedConversation.name.trim() !== conversation.name.trim()
) {
onSave(changedConversation);
setIsEdit(false);
} else {

@@ -15,6 +15,7 @@ export function handleFetchAnswer(
attachments?: string[],
save_conversation = true,
modelId?: string,
idempotencyKey?: string,
): Promise<
| {
result: any;
@@ -66,8 +67,10 @@ export function handleFetchAnswer(
payload.retriever = selectedDocs[0].retriever as string;
}
}
const headers: Record<string, string> = {};
if (idempotencyKey) headers['Idempotency-Key'] = idempotencyKey;
return conversationService
.answer(payload, token, signal)
.answer(payload, token, signal, headers)
.then((response) => {
if (response.ok) {
return response.json();
@@ -104,6 +107,7 @@ export function handleFetchAnswerSteaming(
attachments?: string[],
save_conversation = true,
modelId?: string,
idempotencyKey?: string,
): Promise<Answer> {
const payload: RetrievalPayload = {
question: question,
@@ -137,9 +141,11 @@ export function handleFetchAnswerSteaming(
}
}

const headers: Record<string, string> = {};
if (idempotencyKey) headers['Idempotency-Key'] = idempotencyKey;
return new Promise<Answer>((resolve, reject) => {
conversationService
.answerStream(payload, token, signal)
.answerStream(payload, token, signal, headers)
.then((response) => {
if (!response.body) throw Error('No response body');

@@ -199,15 +205,18 @@ export function handleSubmitToolActions(
token: string | null,
signal: AbortSignal,
onEvent: (event: MessageEvent) => void,
idempotencyKey?: string,
): Promise<Answer> {
const payload = {
conversation_id: conversationId,
tool_actions: toolActions,
};

const headers: Record<string, string> = {};
if (idempotencyKey) headers['Idempotency-Key'] = idempotencyKey;
return new Promise<Answer>((resolve, reject) => {
conversationService
.answerStream(payload, token, signal)
.answerStream(payload, token, signal, headers)
.then((response) => {
if (!response.body) throw Error('No response body');

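On the wire, the net effect of the three headers blocks above is one extra request header; the server can then treat a retried request as the same logical action and dedupe it. A sketch (key value illustrative):

// POST /api/answer HTTP/1.1
// Content-Type: application/json
// Idempotency-Key: 3f1c9c9e-8d2a-4e51-9b7f-0c2d1a6e4f83   <- same value re-sent on retry
//
// A fresh key is minted only when the prompt actually changes (see the
// Conversation.tsx hunk above) or for a new tool-decision submission.
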
@@ -3,6 +3,8 @@ import { ToolCallsType } from './types';
export type MESSAGE_TYPE = 'QUESTION' | 'ANSWER' | 'ERROR';
export type Status = 'idle' | 'loading' | 'failed' | 'awaiting_tool_actions';
export type FEEDBACK = 'LIKE' | 'DISLIKE' | null;
// Mirrors ``conversation_messages.status``.
export type MessageStatus = 'pending' | 'streaming' | 'complete' | 'failed';

export interface Message {
text: string;
@@ -65,6 +67,13 @@ export interface Query {
structured?: boolean;
schema?: object;
research?: ResearchState;
// WAL placeholder id; lets the client tail an in-flight stream.
messageId?: string;
messageStatus?: MessageStatus;
requestId?: string;
lastHeartbeatAt?: string;
// Persisted so Retry can re-send the same key for server-side dedup.
idempotencyKey?: string;
}

export interface RetrievalPayload {

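Illustrative only: a failed server row and the client Query that mapServerQueryToClient (defined in conversationSlice below) produces from it — ids and error text are placeholders:

const serverRow = {
  prompt: 'Summarize the docs',
  status: 'failed',
  message_id: 'msg_123',
  request_id: 'req_456',
  metadata: { error: 'LLM provider timed out' },
};
// Maps to:
// {
//   prompt: 'Summarize the docs',
//   messageId: 'msg_123',
//   messageStatus: 'failed',
//   requestId: 'req_456',
//   error: 'LLM provider timed out',
// }
// Note: no `response` field — only terminal `complete` rows expose one,
// so WAL placeholder text from non-terminal rows never renders.
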
@@ -1,5 +1,6 @@
|
||||
import { createAsyncThunk, createSlice, PayloadAction } from '@reduxjs/toolkit';
|
||||
|
||||
import conversationService from '../api/services/conversationService';
|
||||
import { getConversations } from '../preferences/preferenceApi';
|
||||
import { setConversations } from '../preferences/preferenceSlice';
|
||||
import store from '../store';
|
||||
@@ -7,6 +8,7 @@ import {
|
||||
clearAttachments,
|
||||
selectCompletedAttachments,
|
||||
} from '../upload/uploadSlice';
|
||||
import { newIdempotencyKey } from '../utils/idempotency';
|
||||
import {
|
||||
handleFetchAnswer,
|
||||
handleFetchAnswerSteaming,
|
||||
@@ -16,12 +18,61 @@ import {
|
||||
import {
|
||||
Answer,
|
||||
ConversationState,
|
||||
MessageStatus,
|
||||
Query,
|
||||
ResearchStep,
|
||||
Status,
|
||||
} from './conversationModels';
|
||||
import { ToolCallsType } from './types';
|
||||
|
||||
// Maps a server message dict into the client ``Query`` shape. Only
|
||||
// terminal ``complete`` rows expose ``response``; non-terminal rows
|
||||
// would carry the WAL placeholder text, which must never render.
|
||||
// ``failed`` rows surface as ``error`` so they pick up Retry.
|
||||
export function mapServerQueryToClient(raw: any): Query {
|
||||
const status = raw?.status as MessageStatus | undefined;
|
||||
const isTerminalComplete = status === 'complete';
|
||||
const isFailed = status === 'failed';
|
||||
const metadata = raw?.metadata || {};
|
||||
|
||||
// Empty arrays are JS-truthy; coercing to undefined keeps the
|
||||
// renderer from rendering a blank bubble for in-flight rows and
|
||||
// matches the shape live-stream queries start with.
|
||||
const toolCalls = Array.isArray(raw?.tool_calls) ? raw.tool_calls : undefined;
|
||||
const sources = Array.isArray(raw?.sources) ? raw.sources : undefined;
|
||||
const query: Query = {
|
||||
prompt: raw?.prompt ?? '',
|
||||
feedback: raw?.feedback ?? undefined,
|
||||
thought: raw?.thought ?? undefined,
|
||||
sources: sources && sources.length > 0 ? sources : undefined,
|
||||
tool_calls: toolCalls && toolCalls.length > 0 ? toolCalls : undefined,
|
||||
attachments: raw?.attachments ?? undefined,
|
||||
messageId: raw?.message_id ?? undefined,
|
||||
messageStatus: status,
|
||||
requestId: raw?.request_id ?? undefined,
|
||||
lastHeartbeatAt: raw?.last_heartbeat_at ?? undefined,
|
||||
};
|
||||
|
||||
if (isTerminalComplete) {
|
||||
query.response = raw?.response ?? '';
|
||||
}
|
||||
if (isFailed) {
|
||||
query.error =
|
||||
(typeof metadata.error === 'string' && metadata.error) ||
|
||||
'Generation failed before completing.';
|
||||
}
|
||||
return query;
|
||||
}
|
||||
|
||||
// Placeholder still being produced server-side; client should tail
|
||||
// rather than treat as idle.
|
||||
export function isInFlightMessage(query: Query | undefined): boolean {
|
||||
if (!query) return false;
|
||||
return (
|
||||
query.messageStatus === 'pending' || query.messageStatus === 'streaming'
|
||||
);
|
||||
}
|
||||
|
||||
const initialState: ConversationState = {
|
||||
queries: [],
|
||||
status: 'idle',
|
||||
@@ -39,6 +90,63 @@ export function handleAbort() {
|
||||
}
|
||||
}
|
||||
|
||||
// Loads a conversation and applies it to the slice. Returns
|
||||
// ``{data, stale}``: ``stale`` true means a newer load has superseded
|
||||
// this one (or Redux already matches), so callers should not react to
|
||||
// the returned data; ``data`` null with ``stale`` false means 404.
|
||||
export type LoadConversationResult = {
|
||||
data: any | null;
|
||||
stale: boolean;
|
||||
};
|
||||
|
||||
let loadSeq = 0;
|
||||
|
||||
export const loadConversation = createAsyncThunk<
|
||||
LoadConversationResult,
|
||||
{ id: string; force?: boolean }
|
||||
>('loadConversation', async ({ id, force }, { dispatch, getState }) => {
|
||||
const seq = ++loadSeq;
|
||||
const state = getState() as RootState;
|
||||
const token = state.preference.token;
|
||||
if (!force && state.conversation.conversationId === id) {
|
||||
return { data: null, stale: true };
|
||||
}
|
||||
const response = await conversationService.getConversation(id, token);
|
||||
if (!response.ok) {
|
||||
return { data: null, stale: false };
|
||||
}
|
||||
const data = await response.json();
|
||||
if (!data) return { data: null, stale: false };
|
||||
|
||||
// A later loadConversation has been issued; drop our writes so its
|
||||
// result wins, and tell the caller not to navigate off our return.
|
||||
if (seq !== loadSeq) {
|
||||
return { data: null, stale: true };
|
||||
}
|
||||
|
||||
const mappedQueries = (data.queries || []).map(mapServerQueryToClient);
|
||||
dispatch(conversationSlice.actions.setConversation(mappedQueries));
|
||||
dispatch(
|
||||
conversationSlice.actions.updateConversationId({
|
||||
query: { conversationId: id },
|
||||
}),
|
||||
);
|
||||
|
||||
// Only tail the trailing message; earlier in-flight rows are rare.
|
||||
const lastIdx = mappedQueries.length - 1;
|
||||
const lastQuery = mappedQueries[lastIdx];
|
||||
if (lastQuery && lastQuery.messageId && isInFlightMessage(lastQuery)) {
|
||||
dispatch(
|
||||
tailInFlightMessage({
|
||||
messageId: lastQuery.messageId,
|
||||
index: lastIdx,
|
||||
conversationId: id,
|
||||
}),
|
||||
);
|
||||
}
|
||||
return { data, stale: false };
|
||||
});
|
||||
|
||||
export const fetchAnswer = createAsyncThunk<
|
||||
Answer,
|
||||
{ question: string; indx?: number }
|
||||
@@ -57,11 +165,30 @@ export const fetchAnswer = createAsyncThunk<
|
||||
dispatch(clearAttachments());
|
||||
}
|
||||
|
||||
const currentConversationId = state.conversation.conversationId;
|
||||
// Mutable so the SSE handler can adopt a server-assigned id and
|
||||
// keep passing it to reducer guards once the early ``message_id``
|
||||
// event lands.
|
||||
let currentConversationId = state.conversation.conversationId;
|
||||
const modelId =
|
||||
state.preference.selectedAgent?.default_model_id ||
|
||||
state.preference.selectedModel?.id;
|
||||
|
||||
// Reuse the key on the target Query when present (retry path),
|
||||
// else mint and persist so a later retry can re-send it.
|
||||
const targetIndexForKey =
|
||||
indx ?? Math.max(state.conversation.queries.length - 1, 0);
|
||||
let idempotencyKey =
|
||||
state.conversation.queries[targetIndexForKey]?.idempotencyKey;
|
||||
if (!idempotencyKey) {
|
||||
idempotencyKey = newIdempotencyKey();
|
||||
dispatch(
|
||||
conversationSlice.actions.updateQuery({
|
||||
index: targetIndexForKey,
|
||||
query: { idempotencyKey },
|
||||
}),
|
||||
);
|
||||
}
|
||||
|
||||
if (state.preference) {
|
||||
const agentKey = state.preference.selectedAgent?.key;
|
||||
if (USE_V1_API && agentKey) {
|
||||
@@ -79,7 +206,11 @@ export const fetchAnswer = createAsyncThunk<
|
||||
const data = JSON.parse(event.data);
|
||||
const targetIndex = indx ?? state.conversation.queries.length - 1;
|
||||
|
||||
if (currentConversationId === state.conversation.conversationId) {
|
||||
// Live Redux check; the closure ``state`` is a stale snapshot.
if (
currentConversationId ===
(getState() as RootState).conversation.conversationId
) {
if (data.type === 'end') {
dispatch(conversationSlice.actions.setStatus('idle'));
getConversations(state.preference.token)
@@ -107,6 +238,28 @@ export const fetchAnswer = createAsyncThunk<
}),
);
}
} else if (data.type === 'message_id') {
if (data.conversation_id) {
const currentState = getState() as RootState;
if (currentState.conversation.conversationId === null) {
// setConversationId leaves status='loading'; the
// status-touching updateConversationId would flip it
// to 'idle' and drop subsequent chunks.
dispatch(
conversationSlice.actions.setConversationId(
data.conversation_id,
),
);
currentConversationId = data.conversation_id;
}
}
dispatch(
conversationSlice.actions.updateMessageMeta({
index: targetIndex,
messageId: data.message_id,
requestId: data.request_id,
}),
);
} else if (data.type === 'thought') {
dispatch(
updateThought({
@@ -171,8 +324,11 @@ export const fetchAnswer = createAsyncThunk<
const data = JSON.parse(event.data);
const targetIndex = indx ?? state.conversation.queries.length - 1;

// Only process events if they match the current conversation
if (currentConversationId === state.conversation.conversationId) {
// Live Redux check; the closure ``state`` is a stale snapshot.
if (
currentConversationId ===
(getState() as RootState).conversation.conversationId
) {
if (data.type === 'end') {
dispatch(conversationSlice.actions.setStatus('idle'));
// Only update research status if this query has research data
@@ -211,6 +367,28 @@ export const fetchAnswer = createAsyncThunk<
}),
);
}
} else if (data.type === 'message_id') {
if (data.conversation_id) {
const currentState = getState() as RootState;
if (currentState.conversation.conversationId === null) {
// setConversationId leaves status='loading'; the
// status-touching updateConversationId would flip it
// to 'idle' and drop subsequent chunks.
dispatch(
conversationSlice.actions.setConversationId(
data.conversation_id,
),
);
currentConversationId = data.conversation_id;
}
}
dispatch(
conversationSlice.actions.updateMessageMeta({
index: targetIndex,
messageId: data.message_id,
requestId: data.request_id,
}),
);
} else if (data.type === 'thought') {
const result = data.thought;
dispatch(
@@ -293,6 +471,7 @@ export const fetchAnswer = createAsyncThunk<
attachmentIds,
true,
modelId,
idempotencyKey,
);
} else {
const answer = await handleFetchAnswer(
@@ -307,6 +486,7 @@ export const fetchAnswer = createAsyncThunk<
attachmentIds,
true,
modelId,
idempotencyKey,
);
if (answer) {
let sourcesPrepped = [];
@@ -362,6 +542,67 @@ export const fetchAnswer = createAsyncThunk<
};
});

// Tail-polls the placeholder until terminal status, navigation away,
// or hard timeout. First poll fires immediately so rows that are
// already terminal resolve without delay.
const TAIL_POLL_INTERVAL_MS = 2000;
const TAIL_MAX_POLL_DURATION_MS = 10 * 60 * 1000;

export const tailInFlightMessage = createAsyncThunk<
void,
{ messageId: string; index: number; conversationId: string }
>(
'tailInFlightMessage',
async ({ messageId, index, conversationId }, { dispatch, getState }) => {
const initialState = getState() as RootState;
const token = initialState.preference.token;
const start = Date.now();
dispatch(conversationSlice.actions.setStatus('loading'));

while (Date.now() - start < TAIL_MAX_POLL_DURATION_MS) {
const cur = (getState() as RootState).conversation.conversationId;
if (cur !== conversationId) return;

let resp: Response;
try {
resp = await conversationService.tailMessage(messageId, token);
} catch {
await new Promise((r) => setTimeout(r, TAIL_POLL_INTERVAL_MS));
continue;
}

// 404 → row deleted (e.g. conversation wiped); bail quietly.
if (resp.status === 404) {
dispatch(conversationSlice.actions.setStatus('idle'));
return;
}

if (!resp.ok) {
await new Promise((r) => setTimeout(r, TAIL_POLL_INTERVAL_MS));
continue;
}

const data = await resp.json();
dispatch(
conversationSlice.actions.applyMessageTail({ index, tail: data }),
);

const status = data?.status as MessageStatus | undefined;
if (status === 'complete' || status === 'failed') {
dispatch(
conversationSlice.actions.setStatus(
status === 'failed' ? 'failed' : 'idle',
),
);
return;
}
await new Promise((r) => setTimeout(r, TAIL_POLL_INTERVAL_MS));
}
// Hard timeout: drop status to idle so the user can interact again.
dispatch(conversationSlice.actions.setStatus('idle'));
},
);

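Aside (editor's sketch, not part of the diff): a minimal illustration of how a view could resume these tail-polls for placeholder rows it finds after a page reload. The hook name and the exact store shape are assumptions; only the thunk signature and the messageId/messageStatus fields come from the diff above.

// Editor's sketch, assuming the store shape implied by this diff.
// In a real module, tailInFlightMessage would be imported from the
// conversation slice file shown above.
import { useEffect } from 'react';
import { useDispatch, useSelector } from 'react-redux';

export function useResumeInFlightTails(conversationId: string | null) {
  const dispatch = useDispatch<any>();
  const queries = useSelector((s: any) => s.conversation.queries);

  useEffect(() => {
    if (!conversationId) return;
    queries.forEach((q: any, index: number) => {
      // Placeholder rows reserved server-side carry a messageId and a
      // non-terminal messageStatus; tail those until they resolve.
      if (
        q.messageId &&
        (q.messageStatus === 'pending' || q.messageStatus === 'streaming')
      ) {
        dispatch(
          tailInFlightMessage({ messageId: q.messageId, index, conversationId }),
        );
      }
    });
    // Keyed on the conversation only; each thunk run bails out on its
    // own when the user navigates to a different conversation.
  }, [conversationId]); // eslint-disable-line react-hooks/exhaustive-deps
}
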
export const submitToolActions = createAsyncThunk<
void,
{
@@ -379,10 +620,26 @@ export const submitToolActions = createAsyncThunk<

const state = getState() as RootState;
const conversationId = state.conversation.conversationId;
if (!conversationId) return;
if (!conversationId) {
const targetIndex = state.conversation.queries.length - 1;
if (targetIndex >= 0) {
dispatch(
conversationSlice.actions.raiseError({
conversationId: null,
index: targetIndex,
message:
'Cannot submit decision — the conversation was not initialized. Please retry the question.',
}),
);
}
dispatch(conversationSlice.actions.setStatus('failed'));
return;
}

dispatch(conversationSlice.actions.setStatus('loading'));

// Fresh per submission: a tool decision is its own logical action.
const idempotencyKey = newIdempotencyKey();
await handleSubmitToolActions(
conversationId,
toolActions,
@@ -403,6 +660,15 @@ export const submitToolActions = createAsyncThunk<
});
} else if (data.type === 'id') {
// conversation ID already set
} else if (data.type === 'message_id') {
// Re-stamp; continuation reuses the original placeholder.
dispatch(
conversationSlice.actions.updateMessageMeta({
index: targetIndex,
messageId: data.message_id,
requestId: data.request_id,
}),
);
} else if (data.type === 'thought') {
dispatch(
updateThought({
@@ -447,6 +713,7 @@ export const submitToolActions = createAsyncThunk<
);
}
},
idempotencyKey,
);
});

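Aside (editor's sketch, not part of the diff): dispatching a tool decision from a component might look like the snippet below. The diff truncates the thunk's payload type, so the ToolAction element shape here is an assumption for illustration only.

// Editor's sketch; the ToolAction shape is assumed, not taken from
// the diff. A fresh idempotency key is minted inside the thunk, so a
// resubmission is treated as a new logical action by the server.
import { useDispatch } from 'react-redux';

type ToolAction = { call_id: string; approved: boolean }; // hypothetical

function DecisionButtons({ actions }: { actions: ToolAction[] }) {
  const dispatch = useDispatch<any>();
  return (
    <button onClick={() => dispatch(submitToolActions({ toolActions: actions }))}>
      Submit decision
    </button>
  );
}
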
@@ -462,9 +729,13 @@ export const conversationSlice = createSlice({
},
resendQuery(
state,
action: PayloadAction<{ index: number; prompt: string }>,
action: PayloadAction<{
index: number;
prompt: string;
keepIdempotencyKey?: boolean;
}>,
) {
const { index, prompt } = action.payload;
const { index, prompt, keepIdempotencyKey } = action.payload;
if (index < 0 || index >= state.queries.length) return;

state.queries.splice(index + 1);
@@ -478,6 +749,15 @@ export const conversationSlice = createSlice({
delete state.queries[index].schema;
delete state.queries[index].feedback;
delete state.queries[index].research;
// Drop stale WAL refs; the next stream's message_id event repopulates.
delete state.queries[index].messageId;
delete state.queries[index].messageStatus;
delete state.queries[index].requestId;
delete state.queries[index].lastHeartbeatAt;
// Retry keeps the key so the server can dedupe; Edit drops it.
if (!keepIdempotencyKey) {
delete state.queries[index].idempotencyKey;
}
},
updateStreamingQuery(
state,
@@ -512,6 +792,11 @@ export const conversationSlice = createSlice({
state.conversationId = action.payload.query.conversationId ?? null;
state.status = 'idle';
},
// Sets id without touching status; used mid-stream where the
// status-flipping updateConversationId would drop later chunks.
setConversationId(state, action: PayloadAction<string | null>) {
state.conversationId = action.payload;
},
updateThought(
state,
action: PayloadAction<{
@@ -646,6 +931,47 @@ export const conversationSlice = createSlice({
setStatus(state, action: PayloadAction<Status>) {
state.status = action.payload;
},
updateMessageMeta(
state,
action: PayloadAction<{
index: number;
messageId?: string;
requestId?: string;
}>,
) {
const { index, messageId, requestId } = action.payload;
const query = state.queries[index];
if (!query) return;
if (messageId) query.messageId = messageId;
if (requestId) query.requestId = requestId;
// Mirror the server-side default so a refresh sees 'pending'.
if (!query.messageStatus) query.messageStatus = 'pending';
},
applyMessageTail(
state,
action: PayloadAction<{ index: number; tail: any }>,
) {
const { index, tail } = action.payload;
const query = state.queries[index];
if (!query) return;
const status = tail?.status as MessageStatus | undefined;
query.messageStatus = status;
query.lastHeartbeatAt = tail?.last_heartbeat_at ?? query.lastHeartbeatAt;
if (status === 'complete') {
query.response = tail?.response ?? '';
query.thought = tail?.thought ?? query.thought;
query.sources = tail?.sources ?? query.sources;
query.tool_calls = tail?.tool_calls ?? query.tool_calls;
delete query.error;
} else if (status === 'failed') {
// Surface as error so the placeholder text never renders.
query.error =
(typeof tail?.error === 'string' && tail.error) ||
'Generation failed before completing.';
delete query.response;
}
// pending / streaming: untouched; spinner keeps showing.
},
raiseError(
state,
action: PayloadAction<{
@@ -704,8 +1030,11 @@ export const {
updateResearchPlan,
updateResearchProgress,
setConversation,
setConversationId,
setStatus,
raiseError,
resetConversation,
applyMessageTail,
updateMessageMeta,
} = conversationSlice.actions;
export default conversationSlice.reducer;

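Aside (editor's sketch, not part of the diff): the tail payload that applyMessageTail consumes, as implied by the reducer above. Field names are read straight from the reducer (snake_case fields come off the server row); the concrete values and the `store` handle are illustrative.

// Editor's sketch; `store` stands for the app's configured Redux store.
const tail = {
  status: 'complete' as const,
  response: 'Final answer text',
  thought: 'reasoning trace',
  sources: [{ title: 'doc', text: 'excerpt' }],
  tool_calls: [],
  last_heartbeat_at: '2026-05-02T12:14:32+00:00',
};
store.dispatch(conversationSlice.actions.applyMessageTail({ index: 0, tail }));
// 'complete' copies response/thought/sources/tool_calls onto the query
// and clears any error; 'failed' surfaces tail.error (or a default
// message) and deletes the placeholder response; 'pending'/'streaming'
// leave the query untouched so the spinner keeps showing.
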
@@ -1,4 +1,10 @@
import { useEffect, RefObject, useState } from 'react';
import {
useCallback,
useEffect,
useRef,
useState,
RefObject,
} from 'react';

export function useOutsideAlerter<T extends HTMLElement>(
ref: RefObject<T | null>,
@@ -113,6 +119,51 @@ export function useDarkTheme() {
return [isDarkTheme, toggleTheme, componentMounted] as const;
}

export function useDebouncedValue<T>(value: T, delay = 300): T {
const [debounced, setDebounced] = useState<T>(value);

useEffect(() => {
const timer = setTimeout(() => setDebounced(value), delay);
return () => clearTimeout(timer);
}, [value, delay]);

return debounced;
}

export function useDebouncedCallback<A extends unknown[]>(
callback: (...args: A) => void,
delay = 300,
): ((...args: A) => void) & { cancel: () => void } {
const callbackRef = useRef(callback);
const timerRef = useRef<ReturnType<typeof setTimeout> | null>(null);

useEffect(() => {
callbackRef.current = callback;
}, [callback]);

const cancel = useCallback(() => {
if (timerRef.current) {
clearTimeout(timerRef.current);
timerRef.current = null;
}
}, []);

useEffect(() => cancel, [cancel]);

const debounced = useCallback(
(...args: A) => {
cancel();
timerRef.current = setTimeout(() => {
timerRef.current = null;
callbackRef.current(...args);
}, delay);
},
[delay, cancel],
);

return Object.assign(debounced, { cancel });
}

export function useLoaderState(
initialState = false,
delay = 250,

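Aside (editor's sketch, not part of the diff): typical usage of the two debounce hooks added above. A real component would usually pick one of the two; both are shown here to contrast value-debouncing with callback-debouncing.

// Editor's sketch; the SearchBox component is hypothetical.
import { useState } from 'react';

function SearchBox({ onSearch }: { onSearch: (q: string) => void }) {
  const [term, setTerm] = useState('');
  // Re-renders with the trailing value 500 ms after typing stops.
  const debouncedTerm = useDebouncedValue(term, 500);

  // Stable debounced function; .cancel() clears a pending call, and
  // the hook's cleanup effect cancels automatically on unmount.
  const fireSearch = useDebouncedCallback((q: string) => onSearch(q), 500);

  return (
    <input
      value={term}
      placeholder={debouncedTerm}
      onChange={(e) => {
        setTerm(e.target.value);
        fireSearch(e.target.value);
      }}
    />
  );
}
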
@@ -456,6 +456,11 @@
"create": "Erstellen",
"option": "Benutzern weitere Eingaben erlauben"
},
"searchConversations": {
"searchPlaceholder": "Konversationen durchsuchen",
"noResults": "Keine Ergebnisse gefunden",
"loading": "Laden..."
},
"configTool": {
"title": "Werkzeug-Konfiguration",
"type": "Typ",
@@ -591,6 +596,7 @@
},
"agents": {
"title": "Agenten",
"edit": "Bearbeiten",
"description": "Entdecke und erstelle benutzerdefinierte Versionen von DocsGPT, die Anweisungen, zusätzliches Wissen und beliebige Kombinationen von Fähigkeiten kombinieren",
"newAgent": "Neuer Agent",
"backToAll": "Zurück zu allen Agenten",

@@ -486,6 +486,11 @@
"create": "Create",
"option": "Allow users to prompt further"
},
"searchConversations": {
"searchPlaceholder": "Search conversations",
"noResults": "No results found",
"loading": "Loading..."
},
"configTool": {
"title": "Tool Config",
"type": "Type",
@@ -621,6 +626,7 @@
},
"agents": {
"title": "Agents",
"edit": "Edit",
"description": "Discover and create custom versions of DocsGPT that combine instructions, extra knowledge, and any combination of skills",
"newAgent": "New Agent",
"backToAll": "Back to all agents",

@@ -474,6 +474,11 @@
"create": "Crear",
"option": "Permitir a los usuarios realizar más consultas"
},
"searchConversations": {
"searchPlaceholder": "Buscar conversaciones",
"noResults": "No se encontraron resultados",
"loading": "Cargando..."
},
"configTool": {
"title": "Configuración de la Herramienta",
"type": "Tipo",
@@ -609,6 +614,7 @@
},
"agents": {
"title": "Agentes",
"edit": "Editar",
"description": "Descubre y crea versiones personalizadas de DocsGPT que combinan instrucciones, conocimiento adicional y cualquier combinación de habilidades",
"newAgent": "Nuevo Agente",
"backToAll": "Volver a todos los agentes",

@@ -474,6 +474,11 @@
"create": "作成",
"option": "ユーザーがより多くのクエリを実行できるようにします。"
},
"searchConversations": {
"searchPlaceholder": "会話を検索",
"noResults": "結果が見つかりません",
"loading": "読み込み中..."
},
"configTool": {
"title": "ツール設定",
"type": "タイプ",
@@ -609,6 +614,7 @@
},
"agents": {
"title": "エージェント",
"edit": "編集",
"description": "指示、追加知識、スキルの組み合わせを含むDocsGPTのカスタムバージョンを発見して作成します",
"newAgent": "新しいエージェント",
"backToAll": "すべてのエージェントに戻る",

@@ -474,6 +474,11 @@
"create": "Создать",
"option": "Позволить пользователям делать дополнительные запросы."
},
"searchConversations": {
"searchPlaceholder": "Поиск разговоров",
"noResults": "Результаты не найдены",
"loading": "Загрузка..."
},
"configTool": {
"title": "Настройка инструмента",
"type": "Тип",
@@ -609,6 +614,7 @@
},
"agents": {
"title": "Агенты",
"edit": "Редактировать",
"description": "Откройте и создайте пользовательские версии DocsGPT, которые объединяют инструкции, дополнительные знания и любую комбинацию навыков",
"newAgent": "Новый Агент",
"backToAll": "Вернуться ко всем агентам",

@@ -474,6 +474,11 @@
"create": "建立",
"option": "允許使用者進行更多查詢"
},
"searchConversations": {
"searchPlaceholder": "搜尋對話",
"noResults": "未找到結果",
"loading": "載入中..."
},
"configTool": {
"title": "工具設定",
"type": "類型",
@@ -609,6 +614,7 @@
},
"agents": {
"title": "代理",
"edit": "編輯",
"description": "探索並創建結合指令、額外知識和任意技能組合的DocsGPT自訂版本",
"newAgent": "新建代理",
"backToAll": "返回所有代理",

@@ -474,6 +474,11 @@
"create": "创建",
"option": "允许用户进行更多查询。"
},
"searchConversations": {
"searchPlaceholder": "搜索对话",
"noResults": "未找到结果",
"loading": "加载中..."
},
"configTool": {
"title": "工具配置",
"type": "类型",
@@ -609,6 +614,7 @@
},
"agents": {
"title": "代理",
"edit": "编辑",
"description": "发现并创建结合指令、额外知识和任意技能组合的DocsGPT自定义版本",
"newAgent": "新建代理",
"backToAll": "返回所有代理",

165
frontend/src/modals/SearchConversationsModal.tsx
Normal file
@@ -0,0 +1,165 @@
import { useEffect, useMemo, useRef, useState } from 'react';
import { useTranslation } from 'react-i18next';

import SearchIcon from '../assets/search.svg';
import { searchConversations } from '../preferences/preferenceApi';
import WrapperModal from './WrapperModal';

type ConversationListItem = {
id: string;
name: string;
match_field?: 'name' | 'prompt' | 'response' | null;
match_snippet?: string | null;
};

type SearchConversationsModalProps = {
close: () => void;
conversations: ConversationListItem[];
token: string | null;
onSelectConversation: (id: string) => void;
};

// Escape regex metacharacters so the user query can be used in a RegExp.
function escapeRegExp(value: string): string {
return value.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}

function HighlightedText({ text, query }: { text: string; query: string }) {
const trimmed = query.trim();
if (!trimmed) return <>{text}</>;
const parts = text.split(new RegExp(`(${escapeRegExp(trimmed)})`, 'gi'));
return (
<>
{parts.map((part, idx) =>
part.toLowerCase() === trimmed.toLowerCase() ? (
<mark
key={idx}
className="bg-transparent font-semibold text-purple-30"
>
{part}
</mark>
) : (
<span key={idx}>{part}</span>
),
)}
</>
);
}

export default function SearchConversationsModal({
close,
conversations,
token,
onSelectConversation,
}: SearchConversationsModalProps) {
const { t } = useTranslation();
const inputRef = useRef<HTMLInputElement>(null);

const [query, setQuery] = useState('');
const [results, setResults] = useState<ConversationListItem[] | null>(null);
const [isSearching, setIsSearching] = useState(false);

useEffect(() => {
inputRef.current?.focus();
}, []);

useEffect(() => {
const trimmed = query.trim();
if (!trimmed) {
setResults(null);
setIsSearching(false);
return;
}
setIsSearching(true);
const handle = setTimeout(() => {
searchConversations(trimmed, token).then((result) => {
setResults(result.data ?? []);
setIsSearching(false);
});
}, 300);
return () => clearTimeout(handle);
}, [query, token]);

const visibleConversations = useMemo(() => {
if (!query.trim()) return conversations;
return results ?? [];
}, [query, results, conversations]);

const handleSelect = (id: string) => {
onSelectConversation(id);
close();
};

const showEmptyState =
!!query.trim() && !isSearching && visibleConversations.length === 0;

return (
<WrapperModal
close={close}
className="w-[92vw] max-w-xl p-0"
contentClassName="max-h-[70vh]"
>
<div className="flex flex-col">
<div className="border-sidebar-border flex items-center gap-2 border-b px-5 py-4">
<img src={SearchIcon} alt="search" className="h-4 w-4 opacity-60" />
<input
ref={inputRef}
type="text"
value={query}
onChange={(e) => setQuery(e.target.value)}
placeholder={t('modals.searchConversations.searchPlaceholder')}
className="text-foreground placeholder:text-muted-foreground w-full bg-transparent text-sm outline-none"
/>
</div>

<div className="max-h-[55vh] overflow-y-auto py-2">
{isSearching && (
<div className="text-muted-foreground px-5 py-3 text-xs">
{t('modals.searchConversations.loading')}
</div>
)}
{showEmptyState && (
<div className="text-muted-foreground px-5 py-3 text-xs">
{t('modals.searchConversations.noResults')}
</div>
)}
{!isSearching &&
visibleConversations.map((conversation) => {
const trimmedQuery = query.trim();
const showSnippet =
!!trimmedQuery &&
!!conversation.match_snippet &&
conversation.match_field !== 'name';
return (
<button
key={conversation.id}
type="button"
onClick={() => handleSelect(conversation.id)}
className="hover:bg-sidebar-accent text-foreground flex w-full flex-col items-start gap-0.5 px-5 py-2.5 text-left text-sm"
>
<span className="w-full truncate">
{trimmedQuery ? (
<HighlightedText
text={conversation.name}
query={trimmedQuery}
/>
) : (
conversation.name
)}
</span>
{showSnippet && (
<span className="text-muted-foreground line-clamp-2 w-full text-xs">
<HighlightedText
text={conversation.match_snippet as string}
query={trimmedQuery}
/>
</span>
)}
</button>
);
})}
</div>
</div>
</WrapperModal>
);
}
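Aside (editor's sketch, not part of the diff): what the two highlight helpers above do with a query that contains regex metacharacters.

// Editor's sketch; inputs are illustrative.
escapeRegExp('c++ (v2)'); // => 'c\\+\\+ \\(v2\\)'
// <HighlightedText text="Intro to C++ (v2)" query="c++ (v2)" /> splits
// the text on a case-insensitive capture group built from the escaped
// query, so matching segments render inside <mark> while the rest
// stays in plain <span>s. Without escapeRegExp, the '+' and '(' would
// throw or silently change the pattern.
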
@@ -85,6 +85,49 @@ export async function getConversations(
}
}

export async function searchConversations(
query: string,
token: string | null,
limit = 30,
): Promise<GetConversationsResult> {
try {
const response = await conversationService.searchConversations(
query,
token,
limit,
);

if (!response.ok) {
console.error('Error searching conversations:', response.statusText);
return { data: null, loading: false };
}

const rawData: unknown = await response.json();
if (!Array.isArray(rawData)) {
console.error(
'Invalid data format received from API: Expected an array.',
rawData,
);
return { data: null, loading: false };
}

const conversations: ConversationSummary[] = rawData.map((item: any) => ({
id: item.id,
name: item.name,
agent_id: item.agent_id ?? null,
match_field: item.match_field ?? null,
match_snippet: item.match_snippet ?? null,
}));
return { data: conversations, loading: false };
} catch (error) {
console.error(
'An unexpected error occurred while searching conversations:',
error,
);
return { data: null, loading: false };
}
}

export function getLocalApiKey(): string | null {
const key = localStorage.getItem('DocsGPTApiKey');
return key;

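Aside (editor's sketch, not part of the diff): calling the search helper above. Errors are swallowed into { data: null }, so callers only branch on data; `token` would come from the preference slice in a real call site.

// Editor's sketch; the query string is illustrative.
const { data } = await searchConversations('workflows', token, 30);
if (data) {
  for (const c of data) {
    // match_field says which column matched; match_snippet is only
    // populated for prompt/response hits, matching the modal's
    // showSnippet logic above.
    console.log(c.name, c.match_field, c.match_snippet ?? '');
  }
}
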
@@ -2,6 +2,8 @@ export type ConversationSummary = {
id: string;
name: string;
agent_id: string | null;
match_field?: 'name' | 'prompt' | 'response' | null;
match_snippet?: string | null;
};

export type GetConversationsResult = {

@@ -17,7 +17,7 @@ import ContextMenu, { MenuOption } from '../components/ContextMenu';
import Pagination from '../components/DocumentPagination';
import DropdownMenu from '../components/DropdownMenu';
import SkeletonLoader from '../components/SkeletonLoader';
import { useDarkTheme, useLoaderState } from '../hooks';
import { useDarkTheme, useDebouncedValue, useLoaderState } from '../hooks';
import ConfirmationModal from '../modals/ConfirmationModal';
import { ActiveState, Doc, DocumentsProps } from '../models/misc';
import { getDocs, getDocsWithPagination } from '../preferences/preferenceApi';
@@ -58,7 +58,7 @@ export default function Sources({
const token = useSelector(selectToken);

const [searchTerm, setSearchTerm] = useState<string>('');
const [debouncedSearchTerm, setDebouncedSearchTerm] = useState<string>('');
const debouncedSearchTerm = useDebouncedValue(searchTerm, 500);
const [modalState, setModalState] = useState<ActiveState>('INACTIVE');
const [isOnboarding, setIsOnboarding] = useState<boolean>(false);
const [loading, setLoading] = useLoaderState(false);
@@ -117,14 +117,6 @@ export default function Sources({
document: null,
});

useEffect(() => {
const timer = setTimeout(() => {
setDebouncedSearchTerm(searchTerm);
}, 500);

return () => clearTimeout(timer);
}, [searchTerm]);

const refreshDocs = useCallback(
(
field: 'date' | 'tokens' | undefined,

@@ -535,6 +535,7 @@ function Upload({

xhr.open('POST', `${apiHost}/api/upload`);
xhr.setRequestHeader('Authorization', `Bearer ${token}`);
xhr.setRequestHeader('Idempotency-Key', clientTaskId);
xhr.send(formData);
};

@@ -662,6 +663,7 @@ function Upload({

xhr.open('POST', endpoint);
xhr.setRequestHeader('Authorization', `Bearer ${token}`);
xhr.setRequestHeader('Idempotency-Key', clientTaskId);
xhr.send(formData);
};


13
frontend/src/utils/idempotency.ts
Normal file
@@ -0,0 +1,13 @@
// Per-user-action key for the ``Idempotency-Key`` header. Server
// scopes by user, so cross-user reuse is harmless.
export function newIdempotencyKey(): string {
if (
typeof crypto !== 'undefined' &&
typeof crypto.randomUUID === 'function'
) {
return crypto.randomUUID();
}
// Fallback for older Safari / jsdom; uniqueness is enough.
const rand = () => Math.random().toString(16).slice(2, 10);
return `${rand()}-${rand()}-${rand()}-${rand()}`;
}
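Aside (editor's sketch, not part of the diff): stamping the header the helper above generates; the Upload path in Sources does the same with clientTaskId. The endpoint path here is illustrative, not a confirmed DocsGPT route.

// Editor's sketch; '/api/answer' stands in for whichever endpoint the
// caller targets.
const idempotencyKey = newIdempotencyKey();
await fetch('/api/answer', {
  method: 'POST',
  headers: {
    'Content-Type': 'application/json',
    'Idempotency-Key': idempotencyKey,
  },
  body: JSON.stringify({ question: 'what is python?' }),
});
// Retrying with the same key lets the server dedupe the request;
// minting a new key marks a new logical action (compare resendQuery's
// keepIdempotencyKey flag earlier in this diff: Retry keeps the key,
// Edit drops it).
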
@@ -3,6 +3,8 @@
* Follows the convention: {provider}_session_token
*/

import userService from '../api/services/userService';

export const getSessionToken = (provider: string): string | null => {
return localStorage.getItem(`${provider}_session_token`);
};
@@ -19,16 +21,5 @@ export const validateProviderSession = async (
token: string | null,
provider: string,
) => {
const apiHost = import.meta.env.VITE_API_HOST;
return await fetch(`${apiHost}/api/connectors/validate-session`, {
method: 'POST',
headers: {
'Content-Type': 'application/json',
Authorization: `Bearer ${token}`,
},
body: JSON.stringify({
provider: provider,
session_token: getSessionToken(provider),
}),
});
return await userService.validateConnectorSession(provider, token);
};

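Aside (editor's sketch, not part of the diff): the call site is unchanged by the refactor above, since validateProviderSession keeps its signature and only delegates to userService. The provider name and the Response-like return shape here are assumptions.

// Editor's sketch; 'google_drive' is a hypothetical provider id.
const resp = await validateProviderSession(token, 'google_drive');
if (resp.ok) {
  // The session token stored under `${provider}_session_token` in
  // localStorage is still valid; no re-auth needed.
}
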
@@ -286,8 +286,15 @@ def _stream_chat_response(
content: str,
tool_calls: list[dict[str, Any]] | None,
finish_reason: str,
chunk_delay_ms: int = 0,
):
"""Generator yielding SSE frames that match the OpenAI streaming protocol."""
"""Generator yielding SSE frames that match the OpenAI streaming protocol.

``chunk_delay_ms`` (controlled by ``X-Mock-LLM-Stream-Chunk-Delay-Ms``
header) sleeps that many milliseconds between successive SSE frames.
Used by durability E2E tests to simulate slow streams that survive a
mid-flight ``kill -9`` against the consumer.
"""

created = int(time.time())
completion_id = f"chatcmpl-e2e-{digest[:12]}"
@@ -307,23 +314,60 @@ def _stream_chat_response(
],
}

def _maybe_sleep() -> None:
if chunk_delay_ms > 0:
time.sleep(chunk_delay_ms / 1000.0)

# Opening role delta — matches OpenAI's real behavior.
yield _sse(_base_chunk({"role": "assistant", "content": ""}))

if tool_calls:
# Emit tool calls in one delta; content streaming is skipped when
# tool_calls are present, matching what RAG code paths expect.
_maybe_sleep()
yield _sse(_base_chunk({"tool_calls": tool_calls}))
yield _sse(_base_chunk({}, final=True))
else:
chunks = _split_into_chunks(content, STREAM_CHUNK_COUNT)
last_index = len(chunks) - 1
for i, piece in enumerate(chunks):
_maybe_sleep()
yield _sse(_base_chunk({"content": piece}, final=(i == last_index)))

yield "data: [DONE]\n\n"


def _read_int_header(name: str, default: int = 0, ceiling: int = 600_000) -> int:
"""Parse an integer header with a sane upper bound (10 minutes)."""
raw = request.headers.get(name)
if not raw:
return default
try:
value = int(raw)
except (TypeError, ValueError):
return default
if value < 0:
return default
return min(value, ceiling)


def _read_int_env(name: str, default: int = 0, ceiling: int = 600_000) -> int:
"""Same as ``_read_int_header`` but for env vars — the durability E2E
script sets ``MOCK_LLM_FORCE_*_DELAY_MS`` so it can drive slow streams
through DocsGPT's OpenAI client without injecting per-request
headers."""
raw = os.environ.get(name)
if not raw:
return default
try:
value = int(raw)
except (TypeError, ValueError):
return default
if value < 0:
return default
return min(value, ceiling)


@app.post("/v1/chat/completions")
def chat_completions() -> Response:
payload = request.get_json(silent=True) or {}
@@ -333,6 +377,18 @@ def chat_completions() -> Response:
digest = _compute_request_digest(payload)
content, tool_calls, finish_reason, usage = _resolve_chat_response(payload, digest)

# Durability E2E hooks: per-request OR per-process delays so tests can
# simulate slow providers without touching fixtures or recompiling the
# stub. Headers win over env so a single fixture run can opt in/out.
upfront_delay_ms = _read_int_header("X-Mock-LLM-Total-Delay-Ms") or _read_int_env(
"MOCK_LLM_FORCE_TOTAL_DELAY_MS"
)
chunk_delay_ms = _read_int_header(
"X-Mock-LLM-Stream-Chunk-Delay-Ms"
) or _read_int_env("MOCK_LLM_FORCE_STREAM_CHUNK_DELAY_MS")
if upfront_delay_ms > 0:
time.sleep(upfront_delay_ms / 1000.0)

if stream:
generator = _stream_chat_response(
digest=digest,
@@ -340,6 +396,7 @@ def chat_completions() -> Response:
content=content,
tool_calls=tool_calls,
finish_reason=finish_reason,
chunk_delay_ms=chunk_delay_ms,
)
response = Response(
stream_with_context(generator),

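Aside (editor's sketch, not part of the diff): a TypeScript test client opting into the per-request delay hooks described above. The header names and the header-over-env precedence come from the diff; the stub's URL and port are assumptions.

// Editor's sketch; localhost:8081 is a hypothetical stub address.
const resp = await fetch('http://localhost:8081/v1/chat/completions', {
  method: 'POST',
  headers: {
    'Content-Type': 'application/json',
    // 100 ms pause between successive SSE frames (capped server-side).
    'X-Mock-LLM-Stream-Chunk-Delay-Ms': '100',
    // Optional up-front sleep before the first byte.
    'X-Mock-LLM-Total-Delay-Ms': '250',
  },
  body: JSON.stringify({
    model: 'mock',
    stream: true,
    messages: [{ role: 'user', content: 'ping' }],
  }),
});
// Headers win over the MOCK_LLM_FORCE_* env vars, so a single fixture
// run can opt individual requests in or out of slow-stream behavior.
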
1766
scripts/qa/durability_e2e.py
Executable file
File diff suppressed because it is too large
@@ -137,6 +137,39 @@ class TestToolActionParser:
assert action_name is None
assert call_args is None

def test_parse_google_llm_string_arguments_from_resume(self):
# Resume path stringifies dict args for the assistant message format
# before re-invoking _execute_tool_action. The Google parser must
# decode the JSON string back to a dict so the executor's
# ``call_args.items()`` loop doesn't AttributeError.
parser = ToolActionParser("GoogleLLM")

call = Mock()
call.name = "search_docs_42"
call.arguments = '{"query": "workflows", "limit": 5}'

tool_id, action_name, call_args = parser.parse_args(call)

assert tool_id == "42"
assert action_name == "search_docs"
assert call_args == {"query": "workflows", "limit": 5}

def test_parse_google_llm_non_json_string_arguments_fall_back_to_empty_dict(self):
# Malformed string args fall back to ``{}`` so the executor's
# ``call_args.items()`` walk doesn't crash. The executor still
# journals the malformed call via its own type guard.
parser = ToolActionParser("GoogleLLM")

call = Mock()
call.name = "act_7"
call.arguments = "not json"

tool_id, action_name, call_args = parser.parse_args(call)

assert tool_id == "7"
assert action_name == "act"
assert call_args == {}

def test_parse_unknown_llm_type_defaults_to_openai(self):
parser = ToolActionParser("UnknownLLM")

265
tests/agents/test_tool_executor_three_phase.py
Normal file
@@ -0,0 +1,265 @@
"""Tests for the journaled execute path on ToolExecutor.

Each tool call inserts a row into ``tool_call_attempts`` then flips
through ``proposed → executed`` (or ``proposed → failed``). The flip
to ``confirmed`` is owned by the message-finalize path and is only
asserted indirectly here (rows stay in ``executed`` so the reconciler
can pick them up).
"""

from contextlib import contextmanager
from unittest.mock import Mock

import pytest
from sqlalchemy import text

from application.agents.tool_executor import ToolExecutor


@contextmanager
def _yield_pg(conn):
"""Adapter so the executor's ``db_session()`` writes land on ``pg_conn``."""

@contextmanager
def _yield():
yield conn

return _yield


def _patch_db(monkeypatch, pg_conn):
"""Patch all ``db_session`` entry points used by the executor and tools.

Each module imports ``db_session`` / ``db_readonly`` by name so each
module-level binding has to be replaced individually.
"""

@contextmanager
def _use_pg():
yield pg_conn

targets = (
"application.agents.tool_executor",
"application.agents.tools.notes",
"application.agents.tools.todo_list",
"application.storage.db.session",
)
for module in targets:
monkeypatch.setattr(f"{module}.db_session", _use_pg, raising=False)
monkeypatch.setattr(f"{module}.db_readonly", _use_pg, raising=False)


def _drain(gen):
"""Exhaust a generator, returning ``(events, return_value)``."""
events = []
while True:
try:
events.append(next(gen))
except StopIteration as exc:
return events, exc.value


def _select_attempt(pg_conn, call_id):
row = pg_conn.execute(
text("SELECT * FROM tool_call_attempts WHERE call_id = :cid"),
{"cid": call_id},
).fetchone()
return row._mapping if row is not None else None


def _make_call(name="test_action_t1", call_id="c1"):
call = Mock()
call.name = name
call.id = call_id
call.arguments = "{}"
return call


@pytest.mark.unit
class TestExecuteJournaling:
def test_happy_path_proposed_then_executed(
self, pg_conn, mock_tool_manager, monkeypatch
):
executor = ToolExecutor(user="u")
monkeypatch.setattr(
"application.agents.tool_executor.ToolActionParser",
lambda _cls, **kw: Mock(
parse_args=Mock(return_value=("t1", "test_action", {"q": "v"}))
),
)
_patch_db(monkeypatch, pg_conn)

tools_dict = {
"t1": {
"id": "00000000-0000-0000-0000-000000000001",
"name": "test_tool",
"config": {"key": "val"},
"actions": [
{"name": "test_action", "description": "T", "parameters": {"properties": {}}},
],
}
}

events, result = _drain(executor.execute(tools_dict, _make_call(), "MockLLM"))
assert result[0] == "Tool result"

row = _select_attempt(pg_conn, "c1")
assert row is not None
assert row["status"] == "executed"
assert row["tool_name"] == "test_tool"
assert row["action_name"] == "test_action"
assert row["arguments"] == {"q": "v"}
# Result is wrapped so a future ``artifact_id`` can ride alongside.
assert row["result"] == {"result": "Tool result"}
assert row["error"] is None
assert row["message_id"] is None

def test_executor_message_id_is_persisted_on_executed_row(
self, pg_conn, mock_tool_manager, monkeypatch
):
"""When the route stamps a placeholder message_id on the executor,
the journal row carries it forward so ``confirm_executed_tool_calls``
can later flip it to ``confirmed``.
"""
from application.storage.db.repositories.conversations import (
ConversationsRepository,
)

# FK constraint: message_id must reference a real row.
repo = ConversationsRepository(pg_conn)
conv = repo.create("u-mid", "msg-id-test")
msg = repo.reserve_message(
str(conv["id"]),
prompt="q?",
placeholder_response="...",
request_id="req-mid-1",
status="pending",
)
message_uuid = str(msg["id"])

executor = ToolExecutor(user="u")
executor.message_id = message_uuid
monkeypatch.setattr(
"application.agents.tool_executor.ToolActionParser",
lambda _cls, **kw: Mock(
parse_args=Mock(return_value=("t1", "test_action", {}))
),
)
_patch_db(monkeypatch, pg_conn)

tools_dict = {
"t1": {
"id": "00000000-0000-0000-0000-000000000001",
"name": "test_tool",
"config": {"key": "val"},
"actions": [
{"name": "test_action", "description": "T", "parameters": {"properties": {}}},
],
}
}

_drain(executor.execute(tools_dict, _make_call(call_id="cm1"), "MockLLM"))

row = _select_attempt(pg_conn, "cm1")
assert row is not None
assert row["status"] == "executed"
assert str(row["message_id"]) == message_uuid

def test_tool_raises_marks_failed_and_reraises(
self, pg_conn, mock_tool_manager, monkeypatch
):
executor = ToolExecutor(user="u")
monkeypatch.setattr(
"application.agents.tool_executor.ToolActionParser",
lambda _cls, **kw: Mock(
parse_args=Mock(return_value=("t1", "test_action", {}))
),
)
_patch_db(monkeypatch, pg_conn)
mock_tool_manager.load_tool.return_value.execute_action.side_effect = (
RuntimeError("boom")
)

tools_dict = {
"t1": {
"id": "00000000-0000-0000-0000-000000000001",
"name": "test_tool",
"config": {"key": "val"},
"actions": [
{"name": "test_action", "description": "T", "parameters": {"properties": {}}},
],
}
}

gen = executor.execute(tools_dict, _make_call(call_id="c2"), "MockLLM")
with pytest.raises(RuntimeError, match="boom"):
_drain(gen)

row = _select_attempt(pg_conn, "c2")
assert row is not None
assert row["status"] == "failed"
assert row["error"] == "boom"

def test_executed_row_lingers_for_reconciler_when_no_confirm(
self, pg_conn, mock_tool_manager, monkeypatch
):
"""No finalize_message call → row sits in ``executed``."""
executor = ToolExecutor(user="u")
monkeypatch.setattr(
"application.agents.tool_executor.ToolActionParser",
lambda _cls, **kw: Mock(
parse_args=Mock(return_value=("t1", "test_action", {}))
),
)
_patch_db(monkeypatch, pg_conn)

tools_dict = {
"t1": {
"id": "00000000-0000-0000-0000-000000000001",
"name": "test_tool",
"config": {"key": "val"},
"actions": [
{"name": "test_action", "description": "T", "parameters": {"properties": {}}},
],
}
}

_drain(executor.execute(tools_dict, _make_call(call_id="c3"), "MockLLM"))

row = _select_attempt(pg_conn, "c3")
assert row["status"] == "executed"
# Partial index `tool_call_attempts_pending_ts_idx` selects rows
# in ('proposed','executed') — the reconciler reads those.
assert row["status"] in ("proposed", "executed")


@pytest.mark.unit
class TestRepository:
def test_proposed_then_executed_round_trip(self, pg_conn):
from application.storage.db.repositories.tool_call_attempts import (
ToolCallAttemptsRepository,
)

repo = ToolCallAttemptsRepository(pg_conn)
assert repo.record_proposed("c-x", "tool", "act", {"a": 1}) is True
# Duplicate insert is a no-op; original row stays put.
assert repo.record_proposed("c-x", "tool", "act", {"a": 1}) is False
row = _select_attempt(pg_conn, "c-x")
assert row["status"] == "proposed"

assert repo.mark_executed("c-x", {"out": "ok"}) is True
row = _select_attempt(pg_conn, "c-x")
assert row["status"] == "executed"
assert row["result"] == {"result": {"out": "ok"}}

def test_mark_failed_sets_error(self, pg_conn):
from application.storage.db.repositories.tool_call_attempts import (
ToolCallAttemptsRepository,
)

repo = ToolCallAttemptsRepository(pg_conn)
repo.record_proposed("c-y", "tool", "act", {})
assert repo.mark_failed("c-y", "kaboom") is True
row = _select_attempt(pg_conn, "c-y")
assert row["status"] == "failed"
assert row["error"] == "kaboom"

@@ -218,10 +218,19 @@ class TestCompleteStreamMethod:
|
||||
|
||||
decoded_token = {"sub": "user123"}
|
||||
|
||||
# The fresh-question path now reserves a row before agent.gen()
|
||||
# and calls finalize_message at end of stream — assert both fire.
|
||||
with patch.object(
|
||||
resource.conversation_service, "save_conversation"
|
||||
) as mock_save:
|
||||
mock_save.return_value = str(uuid.uuid4())
|
||||
resource.conversation_service, "save_user_question"
|
||||
) as mock_reserve, patch.object(
|
||||
resource.conversation_service, "finalize_message"
|
||||
) as mock_finalize:
|
||||
mock_reserve.return_value = {
|
||||
"conversation_id": str(uuid.uuid4()),
|
||||
"message_id": str(uuid.uuid4()),
|
||||
"request_id": "req-1",
|
||||
}
|
||||
mock_finalize.return_value = True
|
||||
|
||||
list(
|
||||
resource.complete_stream(
|
||||
@@ -234,7 +243,8 @@ class TestCompleteStreamMethod:
|
||||
)
|
||||
)
|
||||
|
||||
mock_save.assert_called_once()
|
||||
mock_reserve.assert_called_once()
|
||||
mock_finalize.assert_called_once()
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -45,19 +45,26 @@ class TestMakeSerializable:
|
||||
got = _make_serializable([u, {"x": u}, 1])
|
||||
assert got == [str(u), {"x": str(u)}, 1]
|
||||
|
||||
def test_bytes_decoded_to_string(self):
|
||||
def test_bytes_base64_encoded(self):
|
||||
# Migrated from UTF-8-replace to base64 once the helper moved to
|
||||
# the shared serialization module — base64 is lossless and round-
|
||||
# trippable (UTF-8-replace silently corrupted binary payloads).
|
||||
import base64
|
||||
from application.api.answer.services.continuation_service import (
|
||||
_make_serializable,
|
||||
)
|
||||
assert _make_serializable(b"hello") == "hello"
|
||||
got = _make_serializable(b"hello")
|
||||
assert got == base64.b64encode(b"hello").decode("ascii")
|
||||
|
||||
def test_bytes_invalid_utf8_replaced(self):
|
||||
def test_bytes_arbitrary_binary_roundtrips(self):
|
||||
import base64
|
||||
from application.api.answer.services.continuation_service import (
|
||||
_make_serializable,
|
||||
)
|
||||
# Invalid UTF-8 byte sequence
|
||||
got = _make_serializable(b"\xff\xfe")
|
||||
raw = b"\xff\xfe\x00\x10"
|
||||
got = _make_serializable(raw)
|
||||
assert isinstance(got, str)
|
||||
assert base64.b64decode(got) == raw
|
||||
|
||||
def test_passes_through_primitives(self):
|
||||
from application.api.answer.services.continuation_service import (
|
||||
@@ -68,6 +75,50 @@ class TestMakeSerializable:
|
||||
assert _make_serializable(None) is None
|
||||
assert _make_serializable(True) is True
|
||||
|
||||
def test_datetime_becomes_iso_string(self):
|
||||
# PG SELECT * pulls timestamptz columns through as datetime —
|
||||
# tools_dict carries ``created_at``/``updated_at`` from user_tools
|
||||
# rows, which would otherwise blow up json.dumps in pending_tool_state.
|
||||
import json
|
||||
from datetime import datetime, timezone
|
||||
from application.api.answer.services.continuation_service import (
|
||||
_make_serializable,
|
||||
)
|
||||
|
||||
ts = datetime(2026, 5, 2, 12, 14, 32, tzinfo=timezone.utc)
|
||||
got = _make_serializable(ts)
|
||||
assert got == "2026-05-02T12:14:32+00:00"
|
||||
json.dumps(got) # would raise on raw datetime
|
||||
|
||||
def test_datetime_nested_in_tools_dict(self):
|
||||
# Mirrors the production failure: tools_dict is a dict-of-dicts
|
||||
# where each tool row has timestamp fields buried under string keys.
|
||||
import json
|
||||
from datetime import datetime, timezone
|
||||
from application.api.answer.services.continuation_service import (
|
||||
_make_serializable,
|
||||
)
|
||||
|
||||
ts = datetime(2026, 5, 2, 12, 14, 32, tzinfo=timezone.utc)
|
||||
tools_dict = {
|
||||
"0": {
|
||||
"name": "mcp_tool",
|
||||
"actions": [{"name": "search", "active": True}],
|
||||
"created_at": ts,
|
||||
"updated_at": ts,
|
||||
}
|
||||
}
|
||||
got = _make_serializable(tools_dict)
|
||||
json.dumps(got)
|
||||
assert got["0"]["created_at"] == "2026-05-02T12:14:32+00:00"
|
||||
|
||||
def test_date_becomes_iso_string(self):
|
||||
from datetime import date
|
||||
from application.api.answer.services.continuation_service import (
|
||||
_make_serializable,
|
||||
)
|
||||
assert _make_serializable(date(2026, 5, 2)) == "2026-05-02"
|
||||
|
||||
|
||||
class TestContinuationServiceSaveLoad:
|
||||
def test_save_and_load_state(self, pg_conn):
|
||||
|
||||
@@ -229,6 +229,489 @@ class TestConversationServiceSave:
|
||||
assert got["name"] == "q-fallback"
|
||||
|
||||
|
||||
class TestSaveUserQuestion:
|
||||
def test_creates_conversation_and_reserves_message(self, pg_conn):
|
||||
from application.api.answer.services.conversation_service import (
|
||||
ConversationService,
|
||||
TERMINATED_RESPONSE_PLACEHOLDER,
|
||||
)
|
||||
from application.storage.db.repositories.conversations import (
|
||||
ConversationsRepository,
|
||||
)
|
||||
|
||||
user = "u-wal-new"
|
||||
with _patch_db(pg_conn):
|
||||
result = ConversationService().save_user_question(
|
||||
conversation_id=None,
|
||||
question="what is python?",
|
||||
decoded_token={"sub": user},
|
||||
)
|
||||
assert result["conversation_id"]
|
||||
assert result["message_id"]
|
||||
assert result["request_id"]
|
||||
|
||||
repo = ConversationsRepository(pg_conn)
|
||||
conv = repo.get_any(result["conversation_id"], user)
|
||||
assert conv is not None
|
||||
messages = repo.get_messages(result["conversation_id"])
|
||||
assert len(messages) == 1
|
||||
assert messages[0]["status"] == "pending"
|
||||
assert messages[0]["prompt"] == "what is python?"
|
||||
assert messages[0]["response"] == TERMINATED_RESPONSE_PLACEHOLDER
|
||||
assert messages[0]["request_id"] == result["request_id"]
|
||||
|
||||
def test_appends_to_existing_conversation(self, pg_conn):
|
||||
from application.api.answer.services.conversation_service import (
|
||||
ConversationService,
|
||||
)
|
||||
from application.storage.db.repositories.conversations import (
|
||||
ConversationsRepository,
|
||||
)
|
||||
|
||||
user = "u-wal-existing"
|
||||
repo = ConversationsRepository(pg_conn)
|
||||
conv = repo.create(user, name="hi")
|
||||
conv_id = str(conv["id"])
|
||||
|
||||
with _patch_db(pg_conn):
|
||||
result = ConversationService().save_user_question(
|
||||
conversation_id=conv_id,
|
||||
question="follow-up",
|
||||
decoded_token={"sub": user},
|
||||
)
|
||||
assert result["conversation_id"] == conv_id
|
||||
msgs = repo.get_messages(conv_id)
|
||||
assert len(msgs) == 1
|
||||
assert msgs[0]["prompt"] == "follow-up"
|
||||
|
||||
def test_raises_when_token_missing(self):
|
||||
from application.api.answer.services.conversation_service import (
|
||||
ConversationService,
|
||||
)
|
||||
with pytest.raises(ValueError):
|
||||
ConversationService().save_user_question(
|
||||
conversation_id=None,
|
||||
question="q",
|
||||
decoded_token=None,
|
||||
)
|
||||
|
||||
def test_regenerate_at_index_replaces_old_message(self, pg_conn):
|
||||
"""Regenerate at ``index`` truncates the old message *and
|
||||
everything after* before reserving the placeholder, so the new
|
||||
WAL row lands at ``position=index`` rather than appending at
|
||||
the end. Pre-fix the WAL path appended unconditionally and the
|
||||
old answer survived alongside the regenerated one.
|
||||
"""
|
||||
from application.api.answer.services.conversation_service import (
|
||||
ConversationService,
|
||||
)
|
||||
from application.storage.db.repositories.conversations import (
|
||||
ConversationsRepository,
|
||||
)
|
||||
|
||||
user = "u-wal-regen"
|
||||
repo = ConversationsRepository(pg_conn)
|
||||
conv = repo.create(user, name="regen-test")
|
||||
conv_id = str(conv["id"])
|
||||
|
||||
# Seed five completed messages at positions 0..4.
|
||||
for i in range(5):
|
||||
repo.append_message(
|
||||
conv_id,
|
||||
{
|
||||
"prompt": f"q{i}",
|
||||
"response": f"a{i}",
|
||||
"thought": "",
|
||||
"sources": [],
|
||||
"tool_calls": [],
|
||||
"metadata": {},
|
||||
},
|
||||
)
|
||||
seeded = repo.get_messages(conv_id)
|
||||
assert len(seeded) == 5
|
||||
assert [m["position"] for m in seeded] == [0, 1, 2, 3, 4]
|
||||
|
||||
with _patch_db(pg_conn):
|
||||
result = ConversationService().save_user_question(
|
||||
conversation_id=conv_id,
|
||||
question="q3-regen",
|
||||
decoded_token={"sub": user},
|
||||
index=3,
|
||||
)
|
||||
|
||||
msgs = repo.get_messages(conv_id)
|
||||
# Positions 0,1,2 from the seed plus the new placeholder at 3.
|
||||
assert [m["position"] for m in msgs] == [0, 1, 2, 3]
|
||||
# The placeholder carries the regenerated prompt.
|
||||
regen = next(m for m in msgs if m["position"] == 3)
|
||||
assert regen["prompt"] == "q3-regen"
|
||||
assert regen["status"] == "pending"
|
||||
assert str(regen["id"]) == result["message_id"]
|
||||
# The old answer at index 3 is gone.
|
||||
assert not any(m["response"] == "a3" for m in msgs)
|
||||
# And anything after index 3 was truncated.
|
||||
assert not any(m["prompt"] == "q4" for m in msgs)
|
||||
|
||||
def test_regenerate_at_index_zero_truncates_everything(self, pg_conn):
|
||||
"""``index=0`` is a valid edge: it should drop every prior
|
||||
message and reseat the placeholder at position 0.
|
||||
"""
|
||||
from application.api.answer.services.conversation_service import (
|
||||
ConversationService,
|
||||
)
|
||||
from application.storage.db.repositories.conversations import (
|
||||
ConversationsRepository,
|
||||
)
|
||||
|
||||
user = "u-wal-regen-zero"
|
||||
repo = ConversationsRepository(pg_conn)
|
||||
conv = repo.create(user, name="regen-zero")
|
||||
conv_id = str(conv["id"])
|
||||
for i in range(3):
|
||||
repo.append_message(
|
||||
conv_id,
|
||||
{
|
||||
"prompt": f"old-{i}",
|
||||
"response": f"old-a-{i}",
|
||||
"thought": "",
|
||||
"sources": [],
|
||||
"tool_calls": [],
|
||||
"metadata": {},
|
||||
},
|
||||
)
|
||||
|
||||
with _patch_db(pg_conn):
|
||||
ConversationService().save_user_question(
|
||||
conversation_id=conv_id,
|
||||
question="fresh-from-start",
|
||||
decoded_token={"sub": user},
|
||||
index=0,
|
||||
)
|
||||
|
||||
msgs = repo.get_messages(conv_id)
|
||||
assert len(msgs) == 1
|
||||
assert msgs[0]["position"] == 0
|
||||
assert msgs[0]["prompt"] == "fresh-from-start"
|
||||
|
||||
def test_regenerate_index_ignored_without_conversation_id(self, pg_conn):
|
||||
"""``index`` only makes sense against an existing conversation;
|
||||
the create-then-reserve path silently treats it as a no-op
|
||||
rather than truncating a freshly-created conversation.
|
||||
"""
|
||||
from application.api.answer.services.conversation_service import (
|
||||
ConversationService,
|
||||
)
|
||||
from application.storage.db.repositories.conversations import (
|
||||
ConversationsRepository,
|
||||
)
|
||||
|
||||
user = "u-wal-regen-no-conv"
|
||||
with _patch_db(pg_conn):
|
||||
result = ConversationService().save_user_question(
|
||||
conversation_id=None,
|
||||
question="brand new q",
|
||||
decoded_token={"sub": user},
|
||||
index=2,
|
||||
)
|
||||
|
||||
repo = ConversationsRepository(pg_conn)
|
||||
msgs = repo.get_messages(result["conversation_id"])
|
||||
assert len(msgs) == 1
|
||||
assert msgs[0]["position"] == 0
|
||||
assert msgs[0]["prompt"] == "brand new q"
|
||||
|
||||
def test_raises_when_conversation_unauthorized(self, pg_conn):
|
||||
from application.api.answer.services.conversation_service import (
|
||||
ConversationService,
|
||||
)
|
||||
with _patch_db(pg_conn), pytest.raises(ValueError):
|
||||
ConversationService().save_user_question(
|
||||
conversation_id="00000000-0000-0000-0000-000000000000",
|
||||
question="q",
|
||||
decoded_token={"sub": "u"},
|
||||
)
|
||||
|
||||
|
||||
class TestFinalizeMessage:
|
||||
def test_finalizes_complete(self, pg_conn):
|
||||
from application.api.answer.services.conversation_service import (
|
||||
ConversationService,
|
||||
)
|
||||
from application.storage.db.repositories.conversations import (
|
||||
ConversationsRepository,
|
||||
)
|
||||
|
||||
user = "u-fin-ok"
|
||||
with _patch_db(pg_conn):
|
||||
svc = ConversationService()
|
||||
res = svc.save_user_question(
|
||||
conversation_id=None,
|
||||
question="q",
|
||||
decoded_token={"sub": user},
|
||||
)
|
||||
assert svc.finalize_message(
|
||||
res["message_id"],
|
||||
"real answer",
|
||||
thought="thinking",
|
||||
sources=[{"text": "x" * 2000, "title": "doc"}],
|
||||
tool_calls=[{"name": "search"}],
|
||||
model_id="gpt-4",
|
||||
metadata={"foo": "bar"},
|
||||
status="complete",
|
||||
) is True
|
||||
|
||||
msgs = ConversationsRepository(pg_conn).get_messages(
|
||||
res["conversation_id"],
|
||||
)
|
||||
assert msgs[0]["response"] == "real answer"
|
||||
assert msgs[0]["status"] == "complete"
|
||||
assert msgs[0]["thought"] == "thinking"
|
||||
assert msgs[0]["model_id"] == "gpt-4"
|
||||
# source text trimmed to 1000 chars at finalize time
|
||||
assert len(msgs[0]["sources"][0]["text"]) == 1000
|
||||
assert msgs[0]["metadata"]["foo"] == "bar"
|
||||
|
||||
def test_finalizes_failed_records_error(self, pg_conn):
|
||||
from application.api.answer.services.conversation_service import (
|
||||
ConversationService,
|
||||
)
|
||||
from application.storage.db.repositories.conversations import (
|
||||
ConversationsRepository,
|
||||
)
|
||||
|
||||
user = "u-fin-fail"
|
||||
with _patch_db(pg_conn):
|
||||
svc = ConversationService()
|
||||
res = svc.save_user_question(
|
||||
conversation_id=None,
|
||||
question="q",
|
||||
decoded_token={"sub": user},
|
||||
)
|
||||
err = RuntimeError("provider down")
|
||||
assert svc.finalize_message(
|
||||
res["message_id"],
|
||||
"fallback text",
|
||||
status="failed",
|
||||
error=err,
|
||||
) is True
|
||||
|
||||
msgs = ConversationsRepository(pg_conn).get_messages(
|
||||
res["conversation_id"],
|
||||
)
|
||||
assert msgs[0]["status"] == "failed"
|
||||
assert msgs[0]["metadata"]["error"] == "RuntimeError: provider down"
|
||||
|
||||
def test_finalize_flips_executed_tool_calls(self, pg_conn):
|
||||
"""finalize_message must mark tool_call_attempts.status='executed'
|
||||
rows as 'confirmed' for the same message_id."""
|
||||
from sqlalchemy import text as sql_text
|
||||
|
||||
from application.api.answer.services.conversation_service import (
|
||||
ConversationService,
|
||||
)
|
||||
|
||||
user = "u-fin-tools"
|
||||
with _patch_db(pg_conn):
|
||||
svc = ConversationService()
|
||||
res = svc.save_user_question(
|
||||
conversation_id=None,
|
||||
question="q",
|
||||
decoded_token={"sub": user},
|
||||
)
|
||||
pg_conn.execute(
|
||||
sql_text(
|
||||
"INSERT INTO tool_call_attempts "
|
||||
"(call_id, message_id, tool_name, action_name, arguments, status) "
|
||||
"VALUES (:cid, CAST(:mid AS uuid), 't', 'a', '{}'::jsonb, 'executed')"
|
||||
),
|
||||
{"cid": "c1", "mid": res["message_id"]},
|
||||
)
|
||||
assert svc.finalize_message(
|
||||
res["message_id"], "ans", status="complete",
|
||||
) is True
|
||||
|
||||
status = pg_conn.execute(
|
||||
sql_text("SELECT status FROM tool_call_attempts WHERE call_id = :cid"),
|
||||
{"cid": "c1"},
|
||||
).scalar()
|
||||
assert status == "confirmed"
|
||||
|
||||
def test_finalize_returns_false_for_unknown_message(self, pg_conn):
|
||||
from application.api.answer.services.conversation_service import (
|
||||
ConversationService,
|
||||
)
|
||||
with _patch_db(pg_conn):
|
||||
assert ConversationService().finalize_message(
|
||||
"00000000-0000-0000-0000-000000000000",
|
||||
"x",
|
||||
status="complete",
|
||||
) is False
    def test_finalize_rolls_back_tool_call_confirm_on_message_update_failure(
        self, pg_conn
    ):
        """Atomicity: if ``update_message_by_id`` raises after the
        tool_call_attempts confirm ran on the same connection, the
        confirm rolls back with the rest of the transaction. The
        ``pg_conn`` fixture pins one connection inside an outer
        rolled-back transaction; we patch ``db_session`` to wrap each
        call in a SAVEPOINT so the production-code ``with`` block
        actually rolls back when the message-update raises.
        """
        from contextlib import contextmanager

        from sqlalchemy import text as sql_text

        from application.api.answer.services.conversation_service import (
            ConversationService,
        )
        from application.storage.db.repositories import conversations as conv_module

        user = "u-fin-rollback"

        @contextmanager
        def _savepoint_session():
            nested = pg_conn.begin_nested()
            try:
                yield pg_conn
                nested.commit()
            except Exception:
                nested.rollback()
                raise

        with patch(
            "application.api.answer.services.conversation_service.db_session",
            _savepoint_session,
        ), patch(
            "application.api.answer.services.conversation_service.db_readonly",
            _savepoint_session,
        ):
            svc = ConversationService()
            res = svc.save_user_question(
                conversation_id=None,
                question="q",
                decoded_token={"sub": user},
            )
            pg_conn.execute(
                sql_text(
                    "INSERT INTO tool_call_attempts "
                    "(call_id, message_id, tool_name, action_name, "
                    "arguments, status) VALUES (:cid, CAST(:mid AS uuid), "
                    "'t', 'a', '{}'::jsonb, 'executed')"
                ),
                {"cid": "rb-1", "mid": res["message_id"]},
            )
            original = conv_module.ConversationsRepository.update_message_by_id

            def boom(self, *args, **kwargs):
                _ = (args, kwargs)
                raise RuntimeError("simulated message-update failure")

            conv_module.ConversationsRepository.update_message_by_id = boom
            try:
                with pytest.raises(RuntimeError):
                    svc.finalize_message(
                        res["message_id"], "answer", status="complete",
                    )
            finally:
                conv_module.ConversationsRepository.update_message_by_id = original

        # The tool_call confirm rolled back: row stays at ``executed``.
        status = pg_conn.execute(
            sql_text(
                "SELECT status FROM tool_call_attempts WHERE call_id = :cid"
            ),
            {"cid": "rb-1"},
        ).scalar()
        assert status == "executed"
        msg_status = pg_conn.execute(
            sql_text(
                "SELECT status FROM conversation_messages "
                "WHERE id = CAST(:mid AS uuid)"
            ),
            {"mid": res["message_id"]},
        ).scalar()
        assert msg_status == "pending"

    def test_finalize_generates_title_when_provided(self, pg_conn):
        from application.api.answer.services.conversation_service import (
            ConversationService,
        )
        from application.storage.db.repositories.conversations import (
            ConversationsRepository,
        )

        user = "u-fin-title"
        mock_llm = MagicMock()
        mock_llm.gen.return_value = "Short Title"
        with _patch_db(pg_conn):
            svc = ConversationService()
            res = svc.save_user_question(
                conversation_id=None,
                question="long question that becomes the fallback name",
                decoded_token={"sub": user},
            )
            assert svc.finalize_message(
                res["message_id"],
                "answer",
                status="complete",
                title_inputs={
                    "llm": mock_llm,
                    "question": "long question that becomes the fallback name",
                    "response": "answer",
                    "model_id": "gpt-4",
                    "fallback_name": (
                        "long question that becomes the fallback name"[:50]
                    ),
                },
            ) is True

        repo = ConversationsRepository(pg_conn)
        conv = repo.get_any(res["conversation_id"], user)
        assert conv["name"] == "Short Title"
class TestSaveUserQuestionFinalizeFailedFlow:
    """LLM fails immediately; question stays queryable with status='failed' + error metadata."""

    def test_failed_llm_leaves_question_persisted(self, pg_conn):
        from application.api.answer.services.conversation_service import (
            ConversationService,
        )
        from application.storage.db.repositories.conversations import (
            ConversationsRepository,
        )

        user = "u-acceptance"
        with _patch_db(pg_conn):
            svc = ConversationService()
            # Simulates the WAL pre-persist before the LLM call.
            res = svc.save_user_question(
                conversation_id=None,
                question="why did this fail?",
                decoded_token={"sub": user},
            )
            # Simulates the LLM raising immediately, caught by complete_stream.
            try:
                raise RuntimeError("upstream 503")
            except RuntimeError as e:
                svc.finalize_message(
                    res["message_id"],
                    "",
                    status="failed",
                    error=e,
                )

        msgs = ConversationsRepository(pg_conn).get_messages(
            res["conversation_id"],
        )
        assert len(msgs) == 1
        assert msgs[0]["prompt"] == "why did this fail?"
        assert msgs[0]["status"] == "failed"
        assert "RuntimeError" in msgs[0]["metadata"]["error"]
        assert "upstream 503" in msgs[0]["metadata"]["error"]

class TestCompressionMetadata:
    def test_update_compression_metadata(self, pg_conn):
        from application.api.answer.services.conversation_service import (

238  tests/api/answer/services/test_token_usage_attribution.py  Normal file
@@ -0,0 +1,238 @@
"""Token-usage attribution tests for the always-on inline-persist model.

Persistence is owned by the per-call decorator in ``application.usage``.
``finalize_message`` no longer writes ``token_usage`` rows. These tests
exercise the decorator path through ``stream_token_usage`` /
``gen_token_usage``:

1. Every LLM call writes one row, regardless of whether the route saves
   the conversation.
2. ``_token_usage_source`` on the LLM instance flows to the row's
   ``source`` column for cost-attribution dashboards.
3. ``_request_id`` on the LLM instance flows to the row's ``request_id``
   column so ``count_in_range`` can DISTINCT-collapse multi-call agent
   runs into a single request.
4. Calls with no attribution (no ``user_id`` and no ``user_api_key``)
   warn and skip — the repository would otherwise raise on the
   ``token_usage_attribution_chk`` constraint.
"""
from __future__ import annotations

import logging
import uuid
from contextlib import contextmanager
from unittest.mock import patch

import pytest
from sqlalchemy import text


@contextmanager
def _patch_db_session_for(modules, conn):
    """Reroute every named module's ``db_session`` to ``conn``."""

    @contextmanager
    def _yield():
        yield conn

    patches = [patch(f"{m}.db_session", _yield) for m in modules]
    for p in patches:
        p.start()
    try:
        yield
    finally:
        for p in patches:
            p.stop()


def _seed_user(conn) -> str:
    user_id = str(uuid.uuid4())
    conn.execute(
        text(
            "INSERT INTO users (user_id) VALUES (:u) "
            "ON CONFLICT (user_id) DO NOTHING"
        ),
        {"u": user_id},
    )
    return user_id


@pytest.mark.unit
class TestDecoratorAlwaysPersists:
    """Per-call inline persistence — no opt-in flag."""

    def test_primary_stream_writes_agent_stream_row(self, pg_conn):
        from application.usage import stream_token_usage

        user_id = _seed_user(pg_conn)

        class _PrimaryLLM:
            decoded_token = {"sub": user_id}
            user_api_key = None
            agent_id = None

            def __init__(self):
                self.token_usage = {"prompt_tokens": 0, "generated_tokens": 0}

            @stream_token_usage
            def _raw(self, model, messages, stream, tools, **kwargs):
                yield "chunk-a"
                yield "chunk-b"

        llm = _PrimaryLLM()
        with _patch_db_session_for(("application.usage",), pg_conn):
            for _ in llm._raw(
                "m", [{"role": "user", "content": "hi"}], True, None,
            ):
                pass

        row = pg_conn.execute(
            text(
                "SELECT prompt_tokens, generated_tokens, source, request_id "
                "FROM token_usage WHERE user_id = :u"
            ),
            {"u": user_id},
        ).fetchone()
        assert row is not None
        assert row[2] == "agent_stream"
        assert row[3] is None  # No request_id stamped on this LLM.
        assert row[0] > 0
        assert row[1] > 0

    def test_side_channel_source_flows_to_row(self, pg_conn):
        """``_token_usage_source`` overrides the default ``agent_stream``."""
        from application.usage import stream_token_usage

        user_id = _seed_user(pg_conn)

        class _RagLLM:
            decoded_token = {"sub": user_id}
            user_api_key = None
            agent_id = None
            _token_usage_source = "rag_condense"

            def __init__(self):
                self.token_usage = {"prompt_tokens": 0, "generated_tokens": 0}

            @stream_token_usage
            def _raw(self, model, messages, stream, tools, **kwargs):
                yield "chunk"

        llm = _RagLLM()
        with _patch_db_session_for(("application.usage",), pg_conn):
            for _ in llm._raw("m", [{"role": "user", "content": "q"}], True, None):
                pass

        row = pg_conn.execute(
            text(
                "SELECT source FROM token_usage WHERE user_id = :u"
            ),
            {"u": user_id},
        ).fetchone()
        assert row is not None
        assert row[0] == "rag_condense"

    def test_request_id_propagates_to_row(self, pg_conn):
        """``_request_id`` on the LLM (stamped by the route) lands in
        ``token_usage.request_id`` so ``count_in_range`` can DISTINCT it.
        """
        from application.usage import stream_token_usage

        user_id = _seed_user(pg_conn)
        request_id = f"req-{uuid.uuid4().hex[:12]}"

        class _PrimaryLLM:
            decoded_token = {"sub": user_id}
            user_api_key = None
            agent_id = None

            def __init__(self):
                self.token_usage = {"prompt_tokens": 0, "generated_tokens": 0}
                self._request_id = request_id

            @stream_token_usage
            def _raw(self, model, messages, stream, tools, **kwargs):
                yield "chunk"

        llm = _PrimaryLLM()
        with _patch_db_session_for(("application.usage",), pg_conn):
            # Call twice — the route invokes the LLM once per tool round.
            for _ in llm._raw("m", [{"role": "user", "content": "q"}], True, None):
                pass
            for _ in llm._raw("m", [{"role": "user", "content": "q2"}], True, None):
                pass

        rows = pg_conn.execute(
            text(
                "SELECT request_id FROM token_usage WHERE user_id = :u"
            ),
            {"u": user_id},
        ).fetchall()
        assert len(rows) == 2
        assert all(r[0] == request_id for r in rows)
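
    # The DISTINCT collapse these rows enable; the exact SQL of
    # ``count_in_range`` is an assumption here, but the shape the tests
    # rely on is:
    #
    #     SELECT count(DISTINCT COALESCE(request_id, id::text))
    #     FROM token_usage
    #     WHERE user_id = :u AND created_at BETWEEN :start AND :end
    #
    # Two rows sharing a request_id count once; rows without one each
    # count individually.
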
    def test_zero_count_call_is_skipped(self, pg_conn):
        from application.usage import gen_token_usage

        user_id = _seed_user(pg_conn)

        class _EmptyLLM:
            decoded_token = {"sub": user_id}
            user_api_key = None
            agent_id = None

            def __init__(self):
                self.token_usage = {"prompt_tokens": 0, "generated_tokens": 0}

            @gen_token_usage
            def _raw(self, model, messages, stream, tools, **kwargs):
                return None  # empty result → 0 generated tokens, 0 prompt tokens

        llm = _EmptyLLM()
        with _patch_db_session_for(("application.usage",), pg_conn):
            llm._raw("m", [], False, None)

        n = pg_conn.execute(
            text("SELECT count(*) FROM token_usage WHERE user_id = :u"),
            {"u": user_id},
        ).scalar()
        assert n == 0

    def test_no_attribution_warns_and_skips(self, pg_conn, caplog):
        """No user_id and no api_key → log a warning, don't insert.

        The repository would otherwise raise on the attribution CHECK
        constraint; the decorator skips before that to keep the stream
        running.
        """
        from application.usage import stream_token_usage

        class _OrphanLLM:
            decoded_token = None
            user_api_key = None
            agent_id = None

            def __init__(self):
                self.token_usage = {"prompt_tokens": 0, "generated_tokens": 0}

            @stream_token_usage
            def _raw(self, model, messages, stream, tools, **kwargs):
                yield "chunk"

        llm = _OrphanLLM()
        with _patch_db_session_for(
            ("application.usage",), pg_conn,
        ), caplog.at_level(logging.WARNING, logger="application.usage"):
            for _ in llm._raw("m", [{"role": "user", "content": "q"}], True, None):
                pass

        n = pg_conn.execute(text("SELECT count(*) FROM token_usage")).scalar()
        # Nothing should land for this orphan path; the fixture pins
        # state, so the pre-existing baseline is 0.
        assert n == 0
        assert any(
            "no user_id/api_key" in r.message
            for r in caplog.records
        )

@@ -11,7 +11,8 @@ Additional coverage beyond tests/api/answer/routes/test_base.py:

 import json
 import uuid
-from unittest.mock import MagicMock
+from contextlib import contextmanager
+from unittest.mock import MagicMock, patch

 import pytest

@@ -368,6 +369,11 @@ class TestCompleteStreamCompressionMetadata:

         resource.conversation_service = MagicMock()
         resource.conversation_service.save_conversation.return_value = "conv123"
+        resource.conversation_service.save_user_question.return_value = {
+            "conversation_id": "conv123",
+            "message_id": "msg123",
+            "request_id": "req123",
+        }

         stream = list(
             resource.complete_stream(

@@ -404,6 +410,11 @@ class TestCompleteStreamCompressionMetadata:

         resource.conversation_service = MagicMock()
         resource.conversation_service.save_conversation.return_value = "conv123"
+        resource.conversation_service.save_user_question.return_value = {
+            "conversation_id": "conv123",
+            "message_id": "msg123",
+            "request_id": "req123",
+        }
         resource.conversation_service.update_compression_metadata.side_effect = (
             Exception("db error")
         )

@@ -478,6 +489,11 @@ class TestCompleteStreamGeneratorExit:

         resource.conversation_service = MagicMock()
         resource.conversation_service.save_conversation.return_value = "conv1"
+        resource.conversation_service.save_user_question.return_value = {
+            "conversation_id": "conv1",
+            "message_id": "msg1",
+            "request_id": "req1",
+        }

         gen = resource.complete_stream(
             question="Q",

@@ -489,7 +505,9 @@ class TestCompleteStreamGeneratorExit:
             model_id="gpt-4",
         )

-        # Read first chunk and then close (simulating client disconnect)
+        # Drain the early ``message_id`` event before reading the
+        # ``partial`` chunk that this test is asserting on.
+        next(gen)
         chunk = next(gen)
         assert "partial" in chunk
         gen.close()  # This triggers GeneratorExit

@@ -512,6 +530,11 @@ class TestCompleteStreamGeneratorExit:

         resource.conversation_service = MagicMock()
         resource.conversation_service.save_conversation.return_value = "conv1"
+        resource.conversation_service.save_user_question.return_value = {
+            "conversation_id": "conv1",
+            "message_id": "msg1",
+            "request_id": "req1",
+        }

         gen = resource.complete_stream(
             question="Q",

@@ -524,6 +547,8 @@ class TestCompleteStreamGeneratorExit:
             isNoneDoc=True,
         )

+        # Skip past the early ``message_id`` event.
+        next(gen)
         next(gen)
         gen.close()

@@ -547,6 +572,11 @@ class TestCompleteStreamGeneratorExit:
         resource.conversation_service.save_conversation.side_effect = Exception(
             "save error"
         )
+        resource.conversation_service.save_user_question.return_value = {
+            "conversation_id": "conv1",
+            "message_id": "msg1",
+            "request_id": "req1",
+        }

         gen = resource.complete_stream(
             question="Q",

@@ -558,5 +588,127 @@ class TestCompleteStreamGeneratorExit:
             model_id="gpt-4",
         )

+        # Skip past the early ``message_id`` event.
+        next(gen)
         next(gen)
         gen.close()  # Should not crash even with save error
+
+
+@contextmanager
+def _patch_db_session(conn):
+    @contextmanager
+    def _yield():
+        yield conn
+
+    with patch(
+        "application.api.answer.services.conversation_service.db_session",
+        _yield,
+    ), patch(
+        "application.api.answer.services.conversation_service.db_readonly",
+        _yield,
+    ):
+        yield
+
+
+@pytest.mark.unit
+class TestCompleteStreamWalAcceptance:
+    """Acceptance for the WAL pre-persist behaviour: when the LLM raises
+    immediately, the user question is still queryable from PG with
+    status='failed' and a meaningful error in metadata."""
+
+    def test_failed_llm_persists_question_with_failed_status(
+        self, pg_conn, flask_app,
+    ):
+        from application.api.answer.routes.base import BaseAnswerResource
+        from application.storage.db.repositories.conversations import (
+            ConversationsRepository,
+        )
+
+        with flask_app.app_context():
+            resource = BaseAnswerResource()
+
+            mock_agent = MagicMock()
+            mock_agent.gen.side_effect = RuntimeError("LLM upstream failed")
+
+            with _patch_db_session(pg_conn):
+                stream = list(
+                    resource.complete_stream(
+                        question="why does the WAL matter?",
+                        agent=mock_agent,
+                        conversation_id=None,
+                        user_api_key=None,
+                        decoded_token={"sub": "u-acceptance"},
+                        should_save_conversation=True,
+                        model_id="gpt-4",
+                    )
+                )
+            error_chunks = [s for s in stream if '"type": "error"' in s]
+            assert len(error_chunks) == 1
+
+            from sqlalchemy import text as sql_text
+            convs = pg_conn.execute(
+                sql_text("SELECT id FROM conversations WHERE user_id = :u"),
+                {"u": "u-acceptance"},
+            ).fetchall()
+            assert len(convs) == 1
+            conv_id = str(convs[0][0])
+            msgs = ConversationsRepository(pg_conn).get_messages(conv_id)
+            assert len(msgs) == 1
+            assert msgs[0]["prompt"] == "why does the WAL matter?"
+            assert msgs[0]["status"] == "failed"
+            assert "RuntimeError" in msgs[0]["metadata"]["error"]
+            assert "LLM upstream failed" in msgs[0]["metadata"]["error"]
+
+    def test_request_id_consistent_across_sse_event_and_wal_row(
+        self, pg_conn, flask_app,
+    ):
+        """The early ``message_id`` SSE event reports the same
+        ``request_id`` that ``save_user_question`` writes onto the WAL
+        row, so client-side correlation, ``token_usage`` joins, and
+        ``count_in_range``'s DISTINCT all line up.
+        """
+        from application.api.answer.routes.base import BaseAnswerResource
+        from application.storage.db.repositories.conversations import (
+            ConversationsRepository,
+        )
+
+        with flask_app.app_context():
+            resource = BaseAnswerResource()
+            mock_agent = MagicMock()
+            mock_agent.gen.return_value = iter([{"answer": "ok"}])
+            mock_agent.tool_calls = []
+            mock_agent.compression_metadata = None
+            mock_agent.compression_saved = False
+
+            with _patch_db_session(pg_conn):
+                stream = list(
+                    resource.complete_stream(
+                        question="hello",
+                        agent=mock_agent,
+                        conversation_id=None,
+                        user_api_key=None,
+                        decoded_token={"sub": "u-request-id"},
+                        should_save_conversation=True,
+                        model_id="gpt-4",
+                    )
+                )
+
+            sse_events = [
+                json.loads(s.replace("data: ", "").strip())
+                for s in stream
+                if s.startswith("data: ")
+            ]
+            early_events = [e for e in sse_events if e.get("type") == "message_id"]
+            assert len(early_events) == 1
+            sse_request_id = early_events[0]["request_id"]
+            assert sse_request_id
+
+            from sqlalchemy import text as sql_text
+            convs = pg_conn.execute(
+                sql_text("SELECT id FROM conversations WHERE user_id = :u"),
+                {"u": "u-request-id"},
+            ).fetchall()
+            assert len(convs) == 1
+            msgs = ConversationsRepository(pg_conn).get_messages(str(convs[0][0]))
+            assert len(msgs) == 1
+            assert msgs[0]["request_id"] == sse_request_id

@@ -710,6 +710,91 @@ class TestUpdateAgent:
             response = UpdateAgent().put(str(agent["id"]))
         assert response.status_code == 400

+    def test_publish_with_default_source_succeeds(self, app, pg_conn):
+        """The frontend's auto-selected "Default" source has no UUID — it
+        sends ``source: ''`` and ``sources: []`` with ``retriever: 'classic'``.
+        Pre-fix, the publish gate rejected this with
+        ``Missing or invalid required fields: Source``. The retriever
+        carries the runtime identity, so the gate now accepts it.
+        """
+        from application.api.user.agents.routes import UpdateAgent
+        from application.storage.db.repositories.agents import AgentsRepository
+        from application.storage.db.repositories.prompts import PromptsRepository
+
+        user = "u-publish-default"
+        # Draft agent with retriever='classic', no source_id, prompt + chunks set.
+        agent = _seed_agent(
+            pg_conn, user=user, status="draft",
+            with_source=False, retriever="classic",
+        )
+        prompt = PromptsRepository(pg_conn).create(user, "p", "c")
+        AgentsRepository(pg_conn).update(
+            str(agent["id"]), user,
+            {
+                "prompt_id": str(prompt["id"]),
+                "chunks": 2,
+                "agent_type": "classic",
+            },
+        )
+
+        with _patch_db(pg_conn), app.test_request_context(
+            f"/api/update_agent/{agent['id']}", method="PUT",
+            json={
+                "name": "n",
+                "description": "d",
+                "status": "published",
+                "source": "",
+                "sources": [],
+                "retriever": "classic",
+            },
+        ):
+            from flask import request
+            request.decoded_token = {"sub": user}
+            response = UpdateAgent().put(str(agent["id"]))
+        assert response.status_code == 200, (
+            f"unexpected error: {response.json}"
+        )
+
+    def test_publish_without_source_or_retriever_returns_400(
+        self, app, pg_conn,
+    ):
+        """If neither a source nor a retriever is configured, the gate
+        still trips — the agent has no way to retrieve anything."""
+        from application.api.user.agents.routes import UpdateAgent
+        from application.storage.db.repositories.agents import AgentsRepository
+        from application.storage.db.repositories.prompts import PromptsRepository
+
+        user = "u-publish-no-retriever"
+        agent = _seed_agent(
+            pg_conn, user=user, status="draft",
+            with_source=False, retriever="",
+        )
+        prompt = PromptsRepository(pg_conn).create(user, "p", "c")
+        AgentsRepository(pg_conn).update(
+            str(agent["id"]), user,
+            {
+                "prompt_id": str(prompt["id"]),
+                "chunks": 2,
+                "agent_type": "classic",
+            },
+        )
+
+        with _patch_db(pg_conn), app.test_request_context(
+            f"/api/update_agent/{agent['id']}", method="PUT",
+            json={
+                "name": "n",
+                "description": "d",
+                "status": "published",
+                "source": "",
+                "sources": [],
+            },
+        ):
+            from flask import request
+            request.decoded_token = {"sub": user}
+            response = UpdateAgent().put(str(agent["id"]))
+        assert response.status_code == 400
+        assert "Source or retriever" in response.json.get("message", "")
+
     def test_publishing_generates_api_key(self, app, pg_conn):
         from application.api.user.agents.routes import UpdateAgent
         from application.storage.db.repositories.agents import AgentsRepository

@@ -55,7 +55,7 @@ class TestAgentWebhookListenerGaps:
         with patch(
             "application.api.user.agents.webhooks.process_agent_webhook"
         ) as mock_process:
-            mock_process.delay.return_value = mock_task
+            mock_process.apply_async.return_value = mock_task
             with app.test_request_context(
                 "/api/webhooks/agents/tok",
                 method="POST",

@@ -77,7 +77,7 @@ class TestAgentWebhookListenerGaps:
         with patch(
             "application.api.user.agents.webhooks.process_agent_webhook"
         ) as mock_process:
-            mock_process.delay.return_value = mock_task
+            mock_process.apply_async.return_value = mock_task
             with app.test_request_context(
                 "/api/webhooks/agents/tok",
                 method="GET",

@@ -90,8 +90,9 @@ class TestAgentWebhookListenerGaps:
             )

         assert response.status_code == 200
-        call_kwargs = mock_process.delay.call_args[1]
-        assert call_kwargs["payload"] == {}
+        call_kwargs = mock_process.apply_async.call_args[1]
+        # apply_async wraps task args under ``kwargs=``.
+        assert call_kwargs["kwargs"]["payload"] == {}

     def test_enqueue_returns_task_id_in_response(self, app):
         """Success response body includes task_id from the Celery task."""

@@ -103,7 +104,7 @@ class TestAgentWebhookListenerGaps:
         with patch(
             "application.api.user.agents.webhooks.process_agent_webhook"
         ) as mock_process:
-            mock_process.delay.return_value = mock_task
+            mock_process.apply_async.return_value = mock_task
             with app.test_request_context(
                 "/api/webhooks/agents/tok",
                 method="POST",

@@ -122,7 +123,7 @@ class TestAgentWebhookListenerGaps:
         with patch(
             "application.api.user.agents.webhooks.process_agent_webhook"
         ) as mock_process:
-            mock_process.delay.side_effect = RuntimeError("celery is down")
+            mock_process.apply_async.side_effect = RuntimeError("celery is down")
             with app.test_request_context(
                 "/api/webhooks/agents/tok",
                 method="POST",

@@ -254,7 +255,7 @@ class TestAgentWebhookListener:
         fake_task = MagicMock(id="task-post")

         with patch(
-            "application.api.user.agents.webhooks.process_agent_webhook.delay",
+            "application.api.user.agents.webhooks.process_agent_webhook.apply_async",
             return_value=fake_task,
         ), app.test_request_context(
             "/api/webhooks/agents/tk-enq", method="POST",

@@ -282,7 +283,7 @@ class TestAgentWebhookListener:
         fake_task = MagicMock(id="task-get")

         with patch(
-            "application.api.user.agents.webhooks.process_agent_webhook.delay",
+            "application.api.user.agents.webhooks.process_agent_webhook.apply_async",
             return_value=fake_task,
         ), app.test_request_context(
             "/api/webhooks/agents/tk-get?foo=bar&baz=42"

407  tests/api/user/agents/test_webhooks_idempotency.py  Normal file
@@ -0,0 +1,407 @@
"""Idempotency-Key behavior on the agent webhook listener route."""

from contextlib import contextmanager
from unittest.mock import MagicMock, patch

import pytest
from flask import Flask


@pytest.fixture
def app():
    return Flask(__name__)


@contextmanager
def _patch_db(conn):
    @contextmanager
    def _yield():
        yield conn

    with patch(
        "application.api.user.agents.webhooks.db_session", _yield
    ), patch(
        "application.api.user.agents.webhooks.db_readonly", _yield
    ):
        yield


def _seed_agent(pg_conn, user="u", token="tk", **kw):
    from application.storage.db.repositories.agents import AgentsRepository
    return AgentsRepository(pg_conn).create(
        user, "a", "published", incoming_webhook_token=token, **kw,
    )


def _apply_async_mock():
    """Mock for ``process_agent_webhook.apply_async``; ``task.id`` mirrors the predetermined id."""
    def _side_effect(*args, **kwargs):
        return MagicMock(id=kwargs.get("task_id") or "auto-task-id")
    return MagicMock(side_effect=_side_effect)


class TestWebhookIdempotency:
    def test_no_header_enqueues_normally(self, app, pg_conn):
        from application.api.user.agents.webhooks import AgentWebhookListener

        agent = _seed_agent(pg_conn, user="u-noh", token="tk-noh")
        apply_mock = _apply_async_mock()

        with _patch_db(pg_conn), patch(
            "application.api.user.agents.webhooks.process_agent_webhook.apply_async",
            apply_mock,
        ), app.test_request_context(
            "/api/webhooks/agents/tk-noh", method="POST",
            json={"event": "x"},
        ):
            listener = AgentWebhookListener()
            response = listener.post(
                webhook_token="tk-noh",
                agent=agent,
                agent_id_str=str(agent["id"]),
            )
        assert response.status_code == 200
        assert apply_mock.call_count == 1

    def test_header_first_post_records_row(self, app, pg_conn):
        from sqlalchemy import text

        from application.api.user.agents.webhooks import AgentWebhookListener

        agent = _seed_agent(pg_conn, user="u-first", token="tk-first")
        apply_mock = _apply_async_mock()

        with _patch_db(pg_conn), patch(
            "application.api.user.agents.webhooks.process_agent_webhook.apply_async",
            apply_mock,
        ), app.test_request_context(
            "/api/webhooks/agents/tk-first", method="POST",
            json={"event": "x"},
            headers={"Idempotency-Key": "key-abc"},
        ):
            listener = AgentWebhookListener()
            response = listener.post(
                webhook_token="tk-first",
                agent=agent,
                agent_id_str=str(agent["id"]),
            )
        assert response.status_code == 200
        assert apply_mock.call_count == 1
        predetermined_id = apply_mock.call_args.kwargs["task_id"]
        assert response.json["task_id"] == predetermined_id

        # Stored under the *scoped* form ``"{agent_id}:{key}"`` so two
        # agents sharing the same raw header don't collapse on PK.
        scoped_key = f"{agent['id']}:key-abc"
        row = pg_conn.execute(
            text("SELECT task_id, agent_id FROM webhook_dedup WHERE idempotency_key = :k"),
            {"k": scoped_key},
        ).fetchone()
        assert row is not None
        assert row[0] == predetermined_id
        assert str(row[1]) == str(agent["id"])
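
    # A sketch of the reservation the listener presumably performs; the
    # SQL and the surrounding flow are assumptions inferred from these
    # tests, not the shipped implementation. The task_id is minted before
    # the INSERT so a losing racer can return the winner's id instead of
    # enqueuing again.
    #
    #     task_id = str(uuid.uuid4())
    #     scoped_key = f"{agent_id}:{raw_header}"
    #     won = conn.execute(
    #         text(
    #             "INSERT INTO webhook_dedup (idempotency_key, agent_id, task_id) "
    #             "VALUES (:k, CAST(:a AS uuid), :t) "
    #             "ON CONFLICT (idempotency_key) DO NOTHING"
    #         ),
    #         {"k": scoped_key, "a": agent_id, "t": task_id},
    #     ).rowcount == 1
    #     if won:
    #         process_agent_webhook.apply_async(task_id=task_id, kwargs={...})
    #     else:
    #         task_id = ...  # read the stored row and echo its task_id
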
    def test_header_forwards_idempotency_key_to_delay(self, app, pg_conn):
        """The Celery task body needs the key so ``with_idempotency`` can
        record terminal status and ``_derive_source_id`` can pick it up.
        """
        from application.api.user.agents.webhooks import AgentWebhookListener

        agent = _seed_agent(pg_conn, user="u-fwd", token="tk-fwd")
        apply_mock = _apply_async_mock()

        with _patch_db(pg_conn), patch(
            "application.api.user.agents.webhooks.process_agent_webhook.apply_async",
            apply_mock,
        ), app.test_request_context(
            "/api/webhooks/agents/tk-fwd", method="POST",
            json={"event": "x"},
            headers={"Idempotency-Key": "key-fwd"},
        ):
            listener = AgentWebhookListener()
            listener.post(
                webhook_token="tk-fwd",
                agent=agent,
                agent_id_str=str(agent["id"]),
            )
        # The worker sees the agent-scoped form so its dedup row is also
        # agent-distinct.
        scoped_key = f"{agent['id']}:key-fwd"
        assert (
            apply_mock.call_args.kwargs["kwargs"]["idempotency_key"]
            == scoped_key
        )

    def test_same_header_second_post_returns_cached(self, app, pg_conn):
        from application.api.user.agents.webhooks import AgentWebhookListener

        agent = _seed_agent(pg_conn, user="u-rep", token="tk-rep")
        apply_mock = _apply_async_mock()

        with _patch_db(pg_conn), patch(
            "application.api.user.agents.webhooks.process_agent_webhook.apply_async",
            apply_mock,
        ):
            with app.test_request_context(
                "/api/webhooks/agents/tk-rep", method="POST",
                json={"event": "x"},
                headers={"Idempotency-Key": "key-rep"},
            ):
                listener = AgentWebhookListener()
                first = listener.post(
                    webhook_token="tk-rep",
                    agent=agent,
                    agent_id_str=str(agent["id"]),
                )
            with app.test_request_context(
                "/api/webhooks/agents/tk-rep", method="POST",
                json={"event": "x"},
                headers={"Idempotency-Key": "key-rep"},
            ):
                listener = AgentWebhookListener()
                second = listener.post(
                    webhook_token="tk-rep",
                    agent=agent,
                    agent_id_str=str(agent["id"]),
                )

        assert first.status_code == 200
        assert second.status_code == 200
        assert first.json == second.json
        assert apply_mock.call_count == 1

    def test_concurrent_same_key_only_one_apply_async(self, app, pg_engine):
        """Race test (M3): N parallel webhook POSTs with the same key → only ONE apply_async.

        Uses ``pg_engine`` so each thread checks out its own DB connection
        (sharing a single Connection serializes at the driver level).
        """
        from concurrent.futures import ThreadPoolExecutor
        from contextlib import contextmanager

        from application.api.user.agents.webhooks import AgentWebhookListener
        from application.storage.db.repositories.agents import AgentsRepository

        with pg_engine.begin() as conn:
            agent = AgentsRepository(conn).create(
                "u-race", "a", "published", incoming_webhook_token="tk-race",
            )

        apply_mock = _apply_async_mock()

        @contextmanager
        def _engine_session():
            with pg_engine.begin() as conn:
                yield conn

        @contextmanager
        def _engine_readonly():
            with pg_engine.connect() as conn:
                yield conn

        def fire(idx):
            # Patches sit outside the thread pool (see below); only the
            # per-thread Flask request context is set up inside.
            with app.test_request_context(
                "/api/webhooks/agents/tk-race", method="POST",
                json={"event": idx},
                headers={"Idempotency-Key": "wh-race"},
            ):
                listener = AgentWebhookListener()
                return listener.post(
                    webhook_token="tk-race",
                    agent=agent,
                    agent_id_str=str(agent["id"]),
                )

        # ``unittest.mock.patch`` is not thread-safe; set up
        # module-attribute patches once before fanning out so every
        # thread sees the mock instead of racing on save/restore.
        with patch(
            "application.api.user.agents.webhooks.db_session",
            _engine_session,
        ), patch(
            "application.api.user.agents.webhooks.db_readonly",
            _engine_readonly,
        ), patch(
            "application.api.user.agents.webhooks.process_agent_webhook.apply_async",
            apply_mock,
        ), ThreadPoolExecutor(max_workers=8) as ex:
            responses = list(ex.map(fire, range(8)))
        assert all(r.status_code == 200 for r in responses)
        assert apply_mock.call_count == 1
        ids = {r.json["task_id"] for r in responses}
        assert len(ids) == 1
        assert "deduplicated" not in ids
    def test_same_key_different_agent_does_not_collide(self, app, pg_conn):
        """Idempotency keys are now scoped by ``agent_id`` — two agents
        sending the same raw header each get their own dedup row, both
        requests enqueue work, and the responses carry distinct
        ``task_id``s. (Pre-fix, the second agent's request was silently
        deduplicated against the first agent's row.)
        """
        from sqlalchemy import text as sql_text

        from application.api.user.agents.webhooks import AgentWebhookListener

        agent_a = _seed_agent(pg_conn, user="u-a", token="tk-a")
        agent_b = _seed_agent(pg_conn, user="u-b", token="tk-b")
        apply_mock = _apply_async_mock()

        with _patch_db(pg_conn), patch(
            "application.api.user.agents.webhooks.process_agent_webhook.apply_async",
            apply_mock,
        ):
            with app.test_request_context(
                "/api/webhooks/agents/tk-a", method="POST",
                json={"event": "x"},
                headers={"Idempotency-Key": "global-key"},
            ):
                listener = AgentWebhookListener()
                first = listener.post(
                    webhook_token="tk-a",
                    agent=agent_a,
                    agent_id_str=str(agent_a["id"]),
                )
            with app.test_request_context(
                "/api/webhooks/agents/tk-b", method="POST",
                json={"event": "x"},
                headers={"Idempotency-Key": "global-key"},
            ):
                listener = AgentWebhookListener()
                second = listener.post(
                    webhook_token="tk-b",
                    agent=agent_b,
                    agent_id_str=str(agent_b["id"]),
                )

        assert first.status_code == 200
        assert second.status_code == 200
        assert first.json["task_id"] != second.json["task_id"]
        assert apply_mock.call_count == 2

        # And there are two ``webhook_dedup`` rows: one per agent scope.
        rows = pg_conn.execute(
            sql_text(
                "SELECT idempotency_key, agent_id FROM webhook_dedup "
                "WHERE idempotency_key LIKE :pat ORDER BY idempotency_key"
            ),
            {"pat": "%:global-key"},
        ).fetchall()
        assert len(rows) == 2
        scopes = {str(r[1]) for r in rows}
        assert scopes == {str(agent_a["id"]), str(agent_b["id"])}

    def test_empty_header_treated_as_absent(self, app, pg_conn):
        from sqlalchemy import text

        from application.api.user.agents.webhooks import AgentWebhookListener

        agent = _seed_agent(pg_conn, user="u-empty", token="tk-empty")
        apply_mock = _apply_async_mock()

        with _patch_db(pg_conn), patch(
            "application.api.user.agents.webhooks.process_agent_webhook.apply_async",
            apply_mock,
        ), app.test_request_context(
            "/api/webhooks/agents/tk-empty", method="POST",
            json={"event": "x"},
            headers={"Idempotency-Key": ""},
        ):
            listener = AgentWebhookListener()
            response = listener.post(
                webhook_token="tk-empty",
                agent=agent,
                agent_id_str=str(agent["id"]),
            )
        assert response.status_code == 200
        assert apply_mock.call_count == 1
        count = pg_conn.execute(
            text("SELECT count(*) FROM webhook_dedup")
        ).scalar()
        assert count == 0

    def test_oversized_header_rejected_with_400(self, app, pg_conn):
        from application.api.user.agents.webhooks import AgentWebhookListener

        agent = _seed_agent(pg_conn, user="u-big", token="tk-big")
        oversized = "x" * 257

        with _patch_db(pg_conn), patch(
            "application.api.user.agents.webhooks.process_agent_webhook.apply_async",
        ) as mock_apply, app.test_request_context(
            "/api/webhooks/agents/tk-big", method="POST",
            json={"event": "x"},
            headers={"Idempotency-Key": oversized},
        ):
            listener = AgentWebhookListener()
            response = listener.post(
                webhook_token="tk-big",
                agent=agent,
                agent_id_str=str(agent["id"]),
            )
        assert response.status_code == 400
        assert mock_apply.call_count == 0

    def test_stale_dedup_row_does_not_block_new_work(self, app, pg_conn):
        """Regression for the TTL fail-shut bug: a >24h-old dedup row
        must not silently drop a new request. Pre-fix, the second POST
        returned ``task_id="deduplicated"`` and never enqueued.
        """
        from sqlalchemy import text

        from application.api.user.agents.webhooks import AgentWebhookListener

        agent = _seed_agent(pg_conn, user="u-stale", token="tk-stale")
        apply_mock = _apply_async_mock()

        # First POST creates a dedup row.
        with _patch_db(pg_conn), patch(
            "application.api.user.agents.webhooks.process_agent_webhook.apply_async",
            apply_mock,
        ), app.test_request_context(
            "/api/webhooks/agents/tk-stale", method="POST",
            json={"event": "x"},
            headers={"Idempotency-Key": "stale-key"},
        ):
            listener = AgentWebhookListener()
            first = listener.post(
                webhook_token="tk-stale",
                agent=agent,
                agent_id_str=str(agent["id"]),
            )
        assert first.status_code == 200
        first_task_id = first.json["task_id"]
        assert first_task_id != "deduplicated"

        # Backdate the row so it looks 25h old.
        scoped_key = f"{agent['id']}:stale-key"
        pg_conn.execute(
            text(
                "UPDATE webhook_dedup SET created_at = "
                "clock_timestamp() - make_interval(hours => 25) "
                "WHERE idempotency_key = :k"
            ),
            {"k": scoped_key},
        )

        # Second POST with the same key must enqueue again, not silently dedup.
        with _patch_db(pg_conn), patch(
            "application.api.user.agents.webhooks.process_agent_webhook.apply_async",
            apply_mock,
        ), app.test_request_context(
            "/api/webhooks/agents/tk-stale", method="POST",
            json={"event": "x2"},
            headers={"Idempotency-Key": "stale-key"},
        ):
            listener = AgentWebhookListener()
            second = listener.post(
                webhook_token="tk-stale",
                agent=agent,
                agent_id_str=str(agent["id"]),
            )
        assert second.status_code == 200
        assert second.json["task_id"] != "deduplicated"
        assert second.json["task_id"] != first_task_id
        assert apply_mock.call_count == 2
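
    # A sketch of the fail-open TTL handling this regression pins down;
    # the exact statement is an assumption, but deleting the expired row
    # before reserving makes a stale key behave like an absent one:
    #
    #     conn.execute(
    #         text(
    #             "DELETE FROM webhook_dedup WHERE idempotency_key = :k "
    #             "AND created_at < now() - interval '24 hours'"
    #         ),
    #         {"k": scoped_key},
    #     )
    #     # ...then the normal INSERT ... ON CONFLICT reservation runs.
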
@@ -107,7 +107,7 @@ class TestUploadFile:
             "application.api.user.sources.upload.StorageCreator.get_storage",
             return_value=fake_storage,
         ), patch(
-            "application.api.user.sources.upload.ingest.delay",
+            "application.api.user.sources.upload.ingest.apply_async",
             return_value=fake_task,
         ), app.test_request_context(
             "/api/upload", method="POST",

@@ -164,7 +164,7 @@ class TestUploadFile:
             "application.api.user.sources.upload.StorageCreator.get_storage",
             return_value=fake_storage,
         ), patch(
-            "application.api.user.sources.upload.ingest.delay",
+            "application.api.user.sources.upload.ingest.apply_async",
             return_value=fake_task,
         ), app.test_request_context(
             "/api/upload", method="POST",

@@ -197,7 +197,7 @@ class TestUploadFile:
             "application.api.user.sources.upload.StorageCreator.get_storage",
             return_value=fake_storage,
         ), patch(
-            "application.api.user.sources.upload.ingest.delay",
+            "application.api.user.sources.upload.ingest.apply_async",
             return_value=MagicMock(id="t"),
         ), app.test_request_context(
             "/api/upload", method="POST",

@@ -274,7 +274,7 @@ class TestUploadRemote:

         fake_task = MagicMock(id="remote-task-1")
         with patch(
-            "application.api.user.sources.upload.ingest_remote.delay",
+            "application.api.user.sources.upload.ingest_remote.apply_async",
             return_value=fake_task,
         ), app.test_request_context(
             "/api/remote", method="POST",

@@ -295,7 +295,7 @@ class TestUploadRemote:

         fake_task = MagicMock(id="url-task")
         with patch(
-            "application.api.user.sources.upload.ingest_remote.delay",
+            "application.api.user.sources.upload.ingest_remote.apply_async",
             return_value=fake_task,
         ), app.test_request_context(
             "/api/remote", method="POST",

@@ -315,7 +315,7 @@ class TestUploadRemote:

         fake_task = MagicMock(id="reddit-task")
         with patch(
-            "application.api.user.sources.upload.ingest_remote.delay",
+            "application.api.user.sources.upload.ingest_remote.apply_async",
             return_value=fake_task,
         ), app.test_request_context(
             "/api/remote", method="POST",

@@ -334,7 +334,7 @@ class TestUploadRemote:
         from application.api.user.sources.upload import UploadRemote

         with patch(
-            "application.api.user.sources.upload.ingest_remote.delay",
+            "application.api.user.sources.upload.ingest_remote.apply_async",
             side_effect=RuntimeError("boom"),
         ), app.test_request_context(
             "/api/remote", method="POST",

@@ -460,7 +460,7 @@ class TestManageSourceFiles:
             "application.api.user.sources.upload.StorageCreator.get_storage",
             return_value=fake_storage,
         ), patch(
-            "application.api.user.tasks.reingest_source_task.delay",
+            "application.api.user.tasks.reingest_source_task.apply_async",
             return_value=fake_task,
         ), app.test_request_context(
             "/api/manage_source_files", method="POST",

@@ -560,7 +560,7 @@ class TestManageSourceFiles:
             "application.api.user.sources.upload.StorageCreator.get_storage",
             return_value=fake_storage,
         ), patch(
-            "application.api.user.tasks.reingest_source_task.delay",
+            "application.api.user.tasks.reingest_source_task.apply_async",
             return_value=fake_task,
         ), app.test_request_context(
             "/api/manage_source_files", method="POST",

@@ -666,7 +666,7 @@ class TestManageSourceFiles:
             "application.api.user.sources.upload.StorageCreator.get_storage",
             return_value=fake_storage,
         ), patch(
-            "application.api.user.tasks.reingest_source_task.delay",
+            "application.api.user.tasks.reingest_source_task.apply_async",
             return_value=fake_task,
         ), app.test_request_context(
             "/api/manage_source_files", method="POST",

Some files were not shown because too many files have changed in this diff.