From 2ae0ef6cc02ae65d1f678a97154c7ad160b749d8 Mon Sep 17 00:00:00 2001 From: Alex Date: Sun, 17 May 2026 23:30:16 +0100 Subject: [PATCH] fix: mini fixes --- application/parser/remote/remote_creator.py | 7 +++-- application/worker.py | 5 ++++ tests/worker/test_remote_worker.py | 32 +++++++++++++++++++++ 3 files changed, 42 insertions(+), 2 deletions(-) diff --git a/application/parser/remote/remote_creator.py b/application/parser/remote/remote_creator.py index b127dcf9..9ff80b3d 100644 --- a/application/parser/remote/remote_creator.py +++ b/application/parser/remote/remote_creator.py @@ -52,8 +52,9 @@ def normalize_remote_data(source_type, remote_data): remote_data: The stored ``remote_data`` (dict, list, str, or None). Returns: - ``source_data`` for the loader: a URL string for url/crawler/ - sitemap/github, a JSON string for reddit, a dict for s3. + Loader input: a URL string or list for url/crawler/sitemap/github, + a JSON string for reddit, a dict for s3; ``None`` when the row has + nothing syncable. """ if remote_data is None: return None @@ -65,6 +66,8 @@ def normalize_remote_data(source_type, remote_data): try: remote_data = json.loads(stripped) except json.JSONDecodeError: + # Not actually JSON — leave remote_data as the original + # string; the per-loader branches below handle a string. pass loader = (source_type or "").lower() diff --git a/application/worker.py b/application/worker.py index c64d00a1..97113882 100755 --- a/application/worker.py +++ b/application/worker.py @@ -1446,6 +1446,11 @@ def sync_worker(self, frequency): continue source_data = normalize_remote_data(source_type, doc.get("remote_data")) + if not source_data: + # No syncable URL/config — skip instead of dispatching a sync + # that can only fail (and emit a spurious failed event). + sync_counts["sync_skipped"] += 1 + continue resp = sync( self, source_data, name, user, source_type, frequency, retriever, doc_id diff --git a/tests/worker/test_remote_worker.py b/tests/worker/test_remote_worker.py index 69fdbd60..c00c831d 100644 --- a/tests/worker/test_remote_worker.py +++ b/tests/worker/test_remote_worker.py @@ -240,6 +240,38 @@ class TestSyncWorker: "loader must receive the URL string, not the remote_data dict" ) + def test_unsyncable_remote_data_is_skipped( + self, + pg_conn, + patch_worker_db, + task_self, + monkeypatch, + ): + """A URL source whose remote_data dict has no URL key normalizes + to None — sync_worker must skip it, not dispatch a doomed sync().""" + from application import worker + + SourcesRepository(pg_conn).create( + "broken-feed", + user_id="frank", + type="url", + retriever="classic", + sync_frequency="monthly", + remote_data={"provider": "url"}, + ) + + def _must_not_run(*args, **kwargs): + raise AssertionError("sync() must not run for unsyncable sources") + + monkeypatch.setattr(worker, "sync", _must_not_run) + + result = worker.sync_worker(task_self, "monthly") + + assert result["total_sync_count"] == 1 + assert result["sync_skipped"] == 1 + assert result["sync_failure"] == 0 + assert result["sync_success"] == 0 + @pytest.mark.unit class TestRemoteWorkerPathTraversal: