fix(events): skip replay budget INCR when no snapshot work possible

_allow_replay incremented the per-user counter on every
/api/events GET, including no-op connects from a fresh client
with no cursor against an empty backlog. React StrictMode dev
double-mounts plus a few tabs trivially tripped the default
30-per-60s budget on idle reconnects.

XLEN pre-check: when last_event_id is None and the user stream
is empty, the connect can't do snapshot work — return True
without INCR. Cursor-bearing connects still INCR unconditionally
(probing the cursor's relationship to stream contents would
require a redundant XRANGE).
This commit is contained in:
Alex
2026-05-12 17:55:08 +01:00
parent 4c8230fb6c
commit cb3ca8a36b
2 changed files with 78 additions and 9 deletions

View File

@@ -120,7 +120,9 @@ def _oldest_retained_id(redis_client, user_id: str) -> Optional[str]:
return None
def _allow_replay(redis_client, user_id: str) -> bool:
def _allow_replay(
redis_client, user_id: str, last_event_id: Optional[str]
) -> bool:
"""Per-user sliding-window snapshot-replay budget.
Increments a Redis counter under
@@ -129,12 +131,39 @@ def _allow_replay(redis_client, user_id: str) -> bool:
Redis is unavailable, fails open — the existing per-user
concurrency cap still bounds parallel enumeration. Returns
``True`` when the budget setting is 0 (disabled) or non-positive.
Budget is only consumed when the connect can plausibly do snapshot
work. A fresh client (``last_event_id is None``) connecting to an
empty backlog (``XLEN == 0``) returns ``True`` without INCR'ing —
this catches the React-StrictMode dev-burst case where double
mounts of an empty-backlog user would otherwise trip 429 in 5
seconds and lock out further connects until the window rolls.
"""
budget = int(settings.EVENTS_REPLAY_BUDGET_REQUESTS_PER_WINDOW)
if budget <= 0:
return True
if redis_client is None:
return True
# Cheap pre-check: only INCR when we might actually replay. XLEN
# is one Redis op; the alternative (INCR every connect) is two
# ops AND wrongly counts no-op probes. The check is conservative:
# if ``last_event_id`` is set we always INCR, even if the cursor
# has already overtaken the latest entry — that case is rare and
# short-lived, and probing further would mean a redundant XRANGE.
if last_event_id is None:
try:
if int(redis_client.xlen(stream_key(user_id))) == 0:
return True
except Exception:
# XLEN probe failed; fall through to the INCR path so a
# transient Redis hiccup can't bypass the budget.
logger.debug(
"XLEN probe failed for replay budget check user=%s; "
"proceeding to INCR",
user_id,
)
window = max(1, int(settings.EVENTS_REPLAY_BUDGET_WINDOW_SECONDS))
key = replay_budget_key(user_id)
try:
@@ -326,7 +355,7 @@ def stream_events() -> Response:
# cursor pinned and lets the frontend back off until the window
# slides (eventStreamClient.ts treats 429 as escalated backoff).
if push_enabled and redis_client is not None and not _allow_replay(
redis_client, user_id
redis_client, user_id, last_event_id
):
if counted:
try:

View File

@@ -440,7 +440,7 @@ class TestReplayRateLimit:
with patch("application.api.events.routes.settings") as mock_settings:
mock_settings.EVENTS_REPLAY_BUDGET_REQUESTS_PER_WINDOW = 0
mock_settings.EVENTS_REPLAY_BUDGET_WINDOW_SECONDS = 60
assert _allow_replay(MagicMock(), "alice") is True
assert _allow_replay(MagicMock(), "alice", "1735682400000-0") is True
def test_allow_replay_returns_true_when_redis_unavailable(self):
from application.api.events.routes import _allow_replay
@@ -448,7 +448,45 @@ class TestReplayRateLimit:
with patch("application.api.events.routes.settings") as mock_settings:
mock_settings.EVENTS_REPLAY_BUDGET_REQUESTS_PER_WINDOW = 5
mock_settings.EVENTS_REPLAY_BUDGET_WINDOW_SECONDS = 60
assert _allow_replay(None, "alice") is True
assert _allow_replay(None, "alice", "1735682400000-0") is True
def test_allow_replay_skips_incr_when_no_cursor_and_empty_backlog(self):
"""Fresh client with no cursor and an empty user stream cannot
do snapshot work — INCR'ing the counter would needlessly
burn budget. Catches the React-StrictMode dev-burst case where
double-mounted components would otherwise 429 in 5 connects.
"""
from application.api.events.routes import _allow_replay
with patch("application.api.events.routes.settings") as mock_settings:
mock_settings.EVENTS_REPLAY_BUDGET_REQUESTS_PER_WINDOW = 3
mock_settings.EVENTS_REPLAY_BUDGET_WINDOW_SECONDS = 60
redis = MagicMock()
redis.xlen.return_value = 0
# 5 connects in a row, all with no cursor — none consume
# budget because the backlog is empty.
for _ in range(5):
assert _allow_replay(redis, "alice", None) is True
redis.xlen.assert_called()
redis.incr.assert_not_called()
def test_allow_replay_incrs_when_no_cursor_but_backlog_present(self):
"""A no-cursor connect against a non-empty backlog *will* do
snapshot work, so it consumes budget normally.
"""
from application.api.events.routes import _allow_replay
with patch("application.api.events.routes.settings") as mock_settings:
mock_settings.EVENTS_REPLAY_BUDGET_REQUESTS_PER_WINDOW = 5
mock_settings.EVENTS_REPLAY_BUDGET_WINDOW_SECONDS = 60
redis = MagicMock()
redis.xlen.return_value = 42
redis.incr.return_value = 1
assert _allow_replay(redis, "alice", None) is True
redis.incr.assert_called_once()
def test_allow_replay_passes_until_budget_exhausted(self):
from application.api.events.routes import _allow_replay
@@ -465,12 +503,14 @@ class TestReplayRateLimit:
redis.incr.side_effect = _incr
# Cursor set → XLEN short-circuit doesn't fire, INCR always runs.
cursor = "1735682400000-0"
# First three pass.
assert _allow_replay(redis, "alice") is True
assert _allow_replay(redis, "alice") is True
assert _allow_replay(redis, "alice") is True
assert _allow_replay(redis, "alice", cursor) is True
assert _allow_replay(redis, "alice", cursor) is True
assert _allow_replay(redis, "alice", cursor) is True
# Fourth refused.
assert _allow_replay(redis, "alice") is False
assert _allow_replay(redis, "alice", cursor) is False
# TTL only seeded on the first INCR (when count == 1).
redis.expire.assert_called_once()
@@ -482,7 +522,7 @@ class TestReplayRateLimit:
mock_settings.EVENTS_REPLAY_BUDGET_WINDOW_SECONDS = 60
redis = MagicMock()
redis.incr.side_effect = Exception("redis down")
assert _allow_replay(redis, "alice") is True
assert _allow_replay(redis, "alice", "1735682400000-0") is True
def test_replay_backlog_passes_count_to_xrange(self):
from application.api.events.routes import _replay_backlog