From cb3ca8a36b38e68afe60b334d4df19e2d41394a5 Mon Sep 17 00:00:00 2001 From: Alex Date: Tue, 12 May 2026 17:55:08 +0100 Subject: [PATCH] fix(events): skip replay budget INCR when no snapshot work possible MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit _allow_replay incremented the per-user counter on every /api/events GET, including no-op connects from a fresh client with no cursor against an empty backlog. React StrictMode dev double-mounts plus a few tabs trivially tripped the default 30-per-60s budget on idle reconnects. XLEN pre-check: when last_event_id is None and the user stream is empty, the connect can't do snapshot work — return True without INCR. Cursor-bearing connects still INCR unconditionally (probing the cursor's relationship to stream contents would require a redundant XRANGE). --- application/api/events/routes.py | 33 +++++++++++++++++-- tests/api/test_events_routes.py | 54 +++++++++++++++++++++++++++----- 2 files changed, 78 insertions(+), 9 deletions(-) diff --git a/application/api/events/routes.py b/application/api/events/routes.py index ca50519f..7e13a8ab 100644 --- a/application/api/events/routes.py +++ b/application/api/events/routes.py @@ -120,7 +120,9 @@ def _oldest_retained_id(redis_client, user_id: str) -> Optional[str]: return None -def _allow_replay(redis_client, user_id: str) -> bool: +def _allow_replay( + redis_client, user_id: str, last_event_id: Optional[str] +) -> bool: """Per-user sliding-window snapshot-replay budget. Increments a Redis counter under @@ -129,12 +131,39 @@ def _allow_replay(redis_client, user_id: str) -> bool: Redis is unavailable, fails open — the existing per-user concurrency cap still bounds parallel enumeration. Returns ``True`` when the budget setting is 0 (disabled) or non-positive. + + Budget is only consumed when the connect can plausibly do snapshot + work. A fresh client (``last_event_id is None``) connecting to an + empty backlog (``XLEN == 0``) returns ``True`` without INCR'ing — + this catches the React-StrictMode dev-burst case where double + mounts of an empty-backlog user would otherwise trip 429 in 5 + seconds and lock out further connects until the window rolls. """ budget = int(settings.EVENTS_REPLAY_BUDGET_REQUESTS_PER_WINDOW) if budget <= 0: return True if redis_client is None: return True + + # Cheap pre-check: only INCR when we might actually replay. XLEN + # is one Redis op; the alternative (INCR every connect) is two + # ops AND wrongly counts no-op probes. The check is conservative: + # if ``last_event_id`` is set we always INCR, even if the cursor + # has already overtaken the latest entry — that case is rare and + # short-lived, and probing further would mean a redundant XRANGE. + if last_event_id is None: + try: + if int(redis_client.xlen(stream_key(user_id))) == 0: + return True + except Exception: + # XLEN probe failed; fall through to the INCR path so a + # transient Redis hiccup can't bypass the budget. + logger.debug( + "XLEN probe failed for replay budget check user=%s; " + "proceeding to INCR", + user_id, + ) + window = max(1, int(settings.EVENTS_REPLAY_BUDGET_WINDOW_SECONDS)) key = replay_budget_key(user_id) try: @@ -326,7 +355,7 @@ def stream_events() -> Response: # cursor pinned and lets the frontend back off until the window # slides (eventStreamClient.ts treats 429 as escalated backoff). if push_enabled and redis_client is not None and not _allow_replay( - redis_client, user_id + redis_client, user_id, last_event_id ): if counted: try: diff --git a/tests/api/test_events_routes.py b/tests/api/test_events_routes.py index 43ac5067..7ea3d7ba 100644 --- a/tests/api/test_events_routes.py +++ b/tests/api/test_events_routes.py @@ -440,7 +440,7 @@ class TestReplayRateLimit: with patch("application.api.events.routes.settings") as mock_settings: mock_settings.EVENTS_REPLAY_BUDGET_REQUESTS_PER_WINDOW = 0 mock_settings.EVENTS_REPLAY_BUDGET_WINDOW_SECONDS = 60 - assert _allow_replay(MagicMock(), "alice") is True + assert _allow_replay(MagicMock(), "alice", "1735682400000-0") is True def test_allow_replay_returns_true_when_redis_unavailable(self): from application.api.events.routes import _allow_replay @@ -448,7 +448,45 @@ class TestReplayRateLimit: with patch("application.api.events.routes.settings") as mock_settings: mock_settings.EVENTS_REPLAY_BUDGET_REQUESTS_PER_WINDOW = 5 mock_settings.EVENTS_REPLAY_BUDGET_WINDOW_SECONDS = 60 - assert _allow_replay(None, "alice") is True + assert _allow_replay(None, "alice", "1735682400000-0") is True + + def test_allow_replay_skips_incr_when_no_cursor_and_empty_backlog(self): + """Fresh client with no cursor and an empty user stream cannot + do snapshot work — INCR'ing the counter would needlessly + burn budget. Catches the React-StrictMode dev-burst case where + double-mounted components would otherwise 429 in 5 connects. + """ + from application.api.events.routes import _allow_replay + + with patch("application.api.events.routes.settings") as mock_settings: + mock_settings.EVENTS_REPLAY_BUDGET_REQUESTS_PER_WINDOW = 3 + mock_settings.EVENTS_REPLAY_BUDGET_WINDOW_SECONDS = 60 + redis = MagicMock() + redis.xlen.return_value = 0 + + # 5 connects in a row, all with no cursor — none consume + # budget because the backlog is empty. + for _ in range(5): + assert _allow_replay(redis, "alice", None) is True + + redis.xlen.assert_called() + redis.incr.assert_not_called() + + def test_allow_replay_incrs_when_no_cursor_but_backlog_present(self): + """A no-cursor connect against a non-empty backlog *will* do + snapshot work, so it consumes budget normally. + """ + from application.api.events.routes import _allow_replay + + with patch("application.api.events.routes.settings") as mock_settings: + mock_settings.EVENTS_REPLAY_BUDGET_REQUESTS_PER_WINDOW = 5 + mock_settings.EVENTS_REPLAY_BUDGET_WINDOW_SECONDS = 60 + redis = MagicMock() + redis.xlen.return_value = 42 + redis.incr.return_value = 1 + + assert _allow_replay(redis, "alice", None) is True + redis.incr.assert_called_once() def test_allow_replay_passes_until_budget_exhausted(self): from application.api.events.routes import _allow_replay @@ -465,12 +503,14 @@ class TestReplayRateLimit: redis.incr.side_effect = _incr + # Cursor set → XLEN short-circuit doesn't fire, INCR always runs. + cursor = "1735682400000-0" # First three pass. - assert _allow_replay(redis, "alice") is True - assert _allow_replay(redis, "alice") is True - assert _allow_replay(redis, "alice") is True + assert _allow_replay(redis, "alice", cursor) is True + assert _allow_replay(redis, "alice", cursor) is True + assert _allow_replay(redis, "alice", cursor) is True # Fourth refused. - assert _allow_replay(redis, "alice") is False + assert _allow_replay(redis, "alice", cursor) is False # TTL only seeded on the first INCR (when count == 1). redis.expire.assert_called_once() @@ -482,7 +522,7 @@ class TestReplayRateLimit: mock_settings.EVENTS_REPLAY_BUDGET_WINDOW_SECONDS = 60 redis = MagicMock() redis.incr.side_effect = Exception("redis down") - assert _allow_replay(redis, "alice") is True + assert _allow_replay(redis, "alice", "1735682400000-0") is True def test_replay_backlog_passes_count_to_xrange(self): from application.api.events.routes import _replay_backlog