* feat: implement URL validation to prevent SSRF

* feat: add zip extraction security

* ruff fixes
This commit is contained in:
Alex
2025-12-24 15:05:35 +00:00
committed by GitHub
parent 83e7a928f1
commit 98e949d2fd
11 changed files with 929 additions and 27 deletions

View File

@@ -1,5 +1,6 @@
from types import SimpleNamespace
from unittest.mock import MagicMock
from urllib.parse import urlparse
import pytest
import requests
@@ -29,6 +30,21 @@ def _fake_extract(value: str) -> SimpleNamespace:
return SimpleNamespace(domain=domain, suffix=suffix)
def _mock_validate_url(url):
"""Mock validate_url that allows test URLs through."""
if not urlparse(url).scheme:
url = "http://" + url
return url
@pytest.fixture(autouse=True)
def _patch_validate_url(monkeypatch):
    """Autouse fixture: swap the crawler module's ``validate_url`` for the mock."""
    target = "application.parser.remote.crawler_markdown.validate_url"
    monkeypatch.setattr(target, _mock_validate_url)
@pytest.fixture(autouse=True)
def _patch_tldextract(monkeypatch):
monkeypatch.setattr(
@@ -112,7 +128,7 @@ def test_load_data_allows_subdomains(_patch_markdownify):
assert len(docs) == 2
def test_load_data_handles_fetch_errors(monkeypatch, _patch_markdownify):
def test_load_data_handles_fetch_errors(monkeypatch, _patch_markdownify, _patch_validate_url):
root_html = """
<html><head><title>Home</title></head>
<body><a href="/about">About</a></body>
@@ -137,3 +153,21 @@ def test_load_data_handles_fetch_errors(monkeypatch, _patch_markdownify):
assert docs[0].text == "Home Markdown"
assert mock_print.called
def test_load_data_returns_empty_on_ssrf_validation_failure(monkeypatch):
    """Check that load_data gives back [] when URL validation raises SSRFError."""
    from application.core.url_validation import SSRFError

    def _reject(url):
        # Simulate the validator refusing the URL outright.
        raise SSRFError("Access to private IP not allowed")

    validate_target = "application.parser.remote.crawler_markdown.validate_url"
    monkeypatch.setattr(validate_target, _reject)

    docs = CrawlerLoader().load_data("http://192.168.1.1")

    assert docs == []