fix: skip sources with no syncable remote_data in sync_worker; document the None return of normalize_remote_data

This commit is contained in:
Alex
2026-05-17 23:30:16 +01:00
parent 2399aff245
commit 2ae0ef6cc0
3 changed files with 42 additions and 2 deletions

View File

@@ -52,8 +52,9 @@ def normalize_remote_data(source_type, remote_data):
remote_data: The stored ``remote_data`` (dict, list, str, or None).
Returns:
``source_data`` for the loader: a URL string for url/crawler/
sitemap/github, a JSON string for reddit, a dict for s3.
Loader input: a URL string or list for url/crawler/sitemap/github,
a JSON string for reddit, a dict for s3; ``None`` when the row has
nothing syncable.
"""
if remote_data is None:
return None
@@ -65,6 +66,8 @@ def normalize_remote_data(source_type, remote_data):
try:
remote_data = json.loads(stripped)
except json.JSONDecodeError:
# Not actually JSON — leave remote_data as the original
# string; the per-loader branches below handle a string.
pass
loader = (source_type or "").lower()

View File

@@ -1446,6 +1446,11 @@ def sync_worker(self, frequency):
continue
source_data = normalize_remote_data(source_type, doc.get("remote_data"))
if not source_data:
# No syncable URL/config — skip instead of dispatching a sync
# that can only fail (and emit a spurious failed event).
sync_counts["sync_skipped"] += 1
continue
resp = sync(
self, source_data, name, user, source_type, frequency, retriever, doc_id

View File

@@ -240,6 +240,38 @@ class TestSyncWorker:
"loader must receive the URL string, not the remote_data dict"
)
def test_unsyncable_remote_data_is_skipped(
    self, pg_conn, patch_worker_db, task_self, monkeypatch
):
    """Check that sync_worker skips a source with nothing to sync.

    The stored row is a ``url`` source whose ``remote_data`` dict has no
    URL key, which is expected to normalize to ``None``. The worker must
    count the row as skipped and must never dispatch ``sync()`` for it.
    """
    from application import worker

    # Persist a monthly "url" source whose remote_data lacks any URL.
    source_fields = {
        "user_id": "frank",
        "type": "url",
        "retriever": "classic",
        "sync_frequency": "monthly",
        "remote_data": {"provider": "url"},
    }
    SourcesRepository(pg_conn).create("broken-feed", **source_fields)

    # Replace sync() with a tripwire: any dispatch fails the test at once.
    def fail_if_dispatched(*args, **kwargs):
        raise AssertionError("sync() must not run for unsyncable sources")

    monkeypatch.setattr(worker, "sync", fail_if_dispatched)

    outcome = worker.sync_worker(task_self, "monthly")

    # The row is seen once and lands in the skipped bucket only.
    expected_counts = {
        "total_sync_count": 1,
        "sync_skipped": 1,
        "sync_failure": 0,
        "sync_success": 0,
    }
    for counter, expected in expected_counts.items():
        assert outcome[counter] == expected
@pytest.mark.unit
class TestRemoteWorkerPathTraversal: