* feat: implement URL validation to prevent SSRF

* feat: add zip extraction security

* ruff fixes
Author: Alex
Date: 2025-12-24 15:05:35 +00:00
Committed by: GitHub
Parent: 83e7a928f1
Commit: 98e949d2fd
11 changed files with 929 additions and 27 deletions


@@ -4,6 +4,7 @@ from urllib.parse import urlparse, urljoin
 from bs4 import BeautifulSoup
 from application.parser.remote.base import BaseRemote
 from application.parser.schema.base import Document
+from application.core.url_validation import validate_url, SSRFError
 from langchain_community.document_loaders import WebBaseLoader

 class CrawlerLoader(BaseRemote):
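
The application.core.url_validation module is only imported here; its implementation is not part of this excerpt. Below is a minimal sketch of what such a validator typically does, assuming it normalizes a missing scheme (replacing the check this diff removes from CrawlerLoader) and refuses hostnames that resolve to private address space. Only the names validate_url and SSRFError come from the diff; everything else is an assumption.

# Hypothetical sketch of application/core/url_validation.py; the real
# implementation in this commit may differ.
import ipaddress
import socket
from urllib.parse import urlparse

class SSRFError(Exception):
    """Raised when a URL points at a disallowed network location."""

def validate_url(url: str) -> str:
    # Assume http when no scheme is given, as the old CrawlerLoader code did.
    if not urlparse(url).scheme:
        url = "http://" + url
    parsed = urlparse(url)
    if parsed.scheme not in ("http", "https"):
        raise SSRFError(f"disallowed scheme: {parsed.scheme!r}")
    if not parsed.hostname:
        raise SSRFError("URL has no hostname")
    try:
        infos = socket.getaddrinfo(parsed.hostname, None)
    except socket.gaierror as e:
        raise SSRFError(f"cannot resolve {parsed.hostname}: {e}") from e
    # Reject private, loopback, link-local, and reserved addresses
    # (e.g. 127.0.0.1, 10.0.0.0/8, 169.254.169.254).
    for info in infos:
        ip = ipaddress.ip_address(info[4][0])
        if ip.is_private or ip.is_loopback or ip.is_link_local or ip.is_reserved:
            raise SSRFError(f"{parsed.hostname} resolves to blocked address {ip}")
    return url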
@@ -16,9 +17,12 @@ class CrawlerLoader(BaseRemote):
         if isinstance(url, list) and url:
             url = url[0]

-        # Check if the URL scheme is provided, if not, assume http
-        if not urlparse(url).scheme:
-            url = "http://" + url
+        # Validate URL to prevent SSRF attacks
+        try:
+            url = validate_url(url)
+        except SSRFError as e:
+            logging.error(f"URL validation failed: {e}")
+            return []

         visited_urls = set()
         base_url = urlparse(url).scheme + "://" + urlparse(url).hostname
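
For illustration, the entry-point check now rejects classic SSRF targets before any request is made. A small example using only the names from the diff; the metadata IP is an illustrative target, assuming the validator blocks link-local ranges as sketched above:

try:
    url = validate_url("http://169.254.169.254/latest/meta-data/")
except SSRFError as e:
    # In CrawlerLoader this is logged and the loader returns an empty list.
    print(f"rejected: {e}")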
@@ -30,7 +34,14 @@ class CrawlerLoader(BaseRemote):
             visited_urls.add(current_url)

             try:
-                response = requests.get(current_url)
+                # Validate each URL before making requests
+                try:
+                    validate_url(current_url)
+                except SSRFError as e:
+                    logging.warning(f"Skipping URL due to validation failure: {current_url} - {e}")
+                    continue
+
+                response = requests.get(current_url, timeout=30)
                 response.raise_for_status()
                 loader = self.loader([current_url])
                 docs = loader.load()
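
One residual gap worth noting: requests.get follows redirects by default, so a URL that passes validate_url can still redirect the crawler to an internal address. A hardening sketch that re-validates every hop; this helper is not part of the diff, and its name and redirect cap are illustrative:

import requests
from urllib.parse import urljoin
from application.core.url_validation import validate_url

def fetch_with_validated_redirects(url: str, max_hops: int = 5) -> requests.Response:
    # Disable automatic redirects so each hop can be re-validated.
    for _ in range(max_hops):
        response = requests.get(url, timeout=30, allow_redirects=False)
        if not response.is_redirect:
            return response
        # Resolve relative Location headers, then reject hops into private
        # or internal address space (validate_url raises SSRFError).
        url = validate_url(urljoin(url, response.headers["Location"]))
    raise requests.TooManyRedirects(f"more than {max_hops} redirects")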