Mirror of https://github.com/arc53/DocsGPT.git, synced 2026-02-21 03:41:02 +00:00
Patches (#2218)
* feat: implement URL validation to prevent SSRF
* feat: add zip extraction security
* ruff fixes
@@ -4,6 +4,7 @@ from urllib.parse import urlparse, urljoin
 from bs4 import BeautifulSoup
 from application.parser.remote.base import BaseRemote
 from application.parser.schema.base import Document
+from application.core.url_validation import validate_url, SSRFError
 from langchain_community.document_loaders import WebBaseLoader
 
 class CrawlerLoader(BaseRemote):
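The application.core.url_validation module imported in the hunk above is added elsewhere in this patch and its implementation is not shown here. As a rough idea of what an SSRF guard of this kind usually does (restrict schemes, resolve the hostname, and reject private, loopback, link-local, and reserved addresses), here is a minimal hypothetical sketch; the actual DocsGPT module may differ:

# Hypothetical sketch of an SSRF guard -- not the actual application/core/url_validation.py.
import ipaddress
import socket
from urllib.parse import urlparse


class SSRFError(Exception):
    """Raised when a URL points at a disallowed scheme or network location."""


def validate_url(url: str) -> str:
    # Assume http when no scheme is given, then parse.
    if "://" not in url:
        url = "http://" + url
    parsed = urlparse(url)
    if parsed.scheme not in ("http", "https"):
        raise SSRFError(f"Disallowed scheme: {parsed.scheme!r}")
    if not parsed.hostname:
        raise SSRFError("URL has no hostname")
    # Resolve the hostname and reject private, loopback, link-local, and reserved targets.
    try:
        resolved = socket.getaddrinfo(parsed.hostname, parsed.port or 80)
    except socket.gaierror as e:
        raise SSRFError(f"Cannot resolve host {parsed.hostname!r}: {e}")
    for _family, _type, _proto, _canon, sockaddr in resolved:
        ip = ipaddress.ip_address(sockaddr[0].split("%")[0])  # strip IPv6 scope id
        if ip.is_private or ip.is_loopback or ip.is_link_local or ip.is_reserved:
            raise SSRFError(f"Blocked address for {parsed.hostname!r}: {ip}")
    return url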
@@ -16,9 +17,12 @@ class CrawlerLoader(BaseRemote):
         if isinstance(url, list) and url:
             url = url[0]
 
-        # Check if the URL scheme is provided, if not, assume http
-        if not urlparse(url).scheme:
-            url = "http://" + url
+        # Validate URL to prevent SSRF attacks
+        try:
+            url = validate_url(url)
+        except SSRFError as e:
+            logging.error(f"URL validation failed: {e}")
+            return []
 
         visited_urls = set()
         base_url = urlparse(url).scheme + "://" + urlparse(url).hostname
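With this change the seed URL goes through validate_url before any crawling starts; the manual "http://" fallback is dropped, presumably because the validator normalizes a missing scheme itself, and a blocked target is logged and makes the method return an empty list instead of starting a crawl. A hedged usage sketch, assuming the loader lives in application/parser/remote/crawler_loader.py, is constructible with no arguments, and exposes load_data like the other remote loaders:

from application.parser.remote.crawler_loader import CrawlerLoader  # module path assumed

loader = CrawlerLoader()
# A cloud metadata endpoint is a classic SSRF target; validation should reject it.
docs = loader.load_data("http://169.254.169.254/latest/meta-data/")
print(docs)  # [] -- the SSRFError is logged and no HTTP request is made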
@@ -30,7 +34,14 @@ class CrawlerLoader(BaseRemote):
             visited_urls.add(current_url)
 
             try:
-                response = requests.get(current_url)
+                # Validate each URL before making requests
+                try:
+                    validate_url(current_url)
+                except SSRFError as e:
+                    logging.warning(f"Skipping URL due to validation failure: {current_url} - {e}")
+                    continue
+
+                response = requests.get(current_url, timeout=30)
                 response.raise_for_status()
                 loader = self.loader([current_url])
                 docs = loader.load()
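Inside the crawl loop, every URL is re-validated before the request is made: pages on an allowed host can link or redirect to internal addresses, so a URL that fails validation is logged as a warning and skipped with continue rather than aborting the whole crawl, and the request itself gains a 30-second timeout. The commit's second item, "add zip extraction security", is not part of this file; as an assumption about what that hardening typically involves (blocking path traversal, a.k.a. zip slip, and capping uncompressed size), a minimal sketch:

# Hypothetical sketch of hardened zip extraction -- not the actual DocsGPT change.
import os
import zipfile


def safe_extract(zip_path: str, dest_dir: str, max_total_bytes: int = 100 * 1024 * 1024) -> None:
    dest_dir = os.path.abspath(dest_dir)
    with zipfile.ZipFile(zip_path) as zf:
        total = 0
        for member in zf.infolist():
            # Every extracted path must stay inside dest_dir (zip-slip guard).
            target = os.path.abspath(os.path.join(dest_dir, member.filename))
            if not target.startswith(dest_dir + os.sep):
                raise ValueError(f"Unsafe path in archive: {member.filename}")
            # Cap total uncompressed size to limit zip-bomb style expansion.
            total += member.file_size
            if total > max_total_bytes:
                raise ValueError("Archive exceeds extraction size limit")
        zf.extractall(dest_dir)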