feat: improve crawlers and update chunk filtering (#2250)
@@ -55,9 +55,14 @@ class GetChunks(Resource):
             if path:
                 chunk_source = metadata.get("source", "")
-                # Check if the chunk's source matches the requested path
+                chunk_file_path = metadata.get("file_path", "")
+                # Check if the chunk matches the requested path
+                # For file uploads: source ends with path (e.g., "inputs/.../file.pdf" ends with "file.pdf")
+                # For crawlers: file_path ends with path (e.g., "guides/setup.md" ends with "setup.md")
+                source_match = chunk_source and chunk_source.endswith(path)
+                file_path_match = chunk_file_path and chunk_file_path.endswith(path)

-                if not chunk_source or not chunk_source.endswith(path):
+                if not (source_match or file_path_match):
                     continue
             # Filter by search term if provided
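Taken on its own, the new chunk filter reduces to a small predicate over the chunk metadata. A minimal sketch for reference (the function name chunk_matches_path and the sample paths are illustrative, not part of the codebase):

def chunk_matches_path(metadata, path):
    # A chunk passes if either its upload source or its crawler-generated
    # virtual file_path ends with the requested path.
    chunk_source = metadata.get("source", "")
    chunk_file_path = metadata.get("file_path", "")
    source_match = bool(chunk_source) and chunk_source.endswith(path)
    file_path_match = bool(chunk_file_path) and chunk_file_path.endswith(path)
    return source_match or file_path_match


# File upload: the stored source path ends with the file name
assert chunk_matches_path({"source": "inputs/user/file.pdf"}, "file.pdf")
# Crawler page: only the virtual file_path matches
assert chunk_matches_path(
    {"source": "https://docs.docsgpt.cloud/guides/setup", "file_path": "guides/setup.md"},
    "setup.md",
)
# No match on either field: the chunk is skipped
assert not chunk_matches_path({"source": "inputs/user/other.pdf"}, "file.pdf")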
@@ -1,4 +1,5 @@
 import logging
+import os
 import requests
 from urllib.parse import urlparse, urljoin
 from bs4 import BeautifulSoup
@@ -47,10 +48,13 @@ class CrawlerLoader(BaseRemote):
                 docs = loader.load()
                 # Convert the loaded documents to your Document schema
                 for doc in docs:
+                    metadata = dict(doc.metadata or {})
+                    source_url = metadata.get("source") or current_url
+                    metadata["file_path"] = self._url_to_virtual_path(source_url)
                     loaded_content.append(
                         Document(
                             doc.page_content,
-                            extra_info=doc.metadata
+                            extra_info=metadata
                         )
                     )
             except Exception as e:
@@ -74,3 +78,29 @@ class CrawlerLoader(BaseRemote):
                     break

         return loaded_content
+
+    def _url_to_virtual_path(self, url):
+        """
+        Convert a URL to a virtual file path ending with .md.
+
+        Examples:
+            https://docs.docsgpt.cloud/ -> index.md
+            https://docs.docsgpt.cloud/guides/setup -> guides/setup.md
+            https://docs.docsgpt.cloud/guides/setup/ -> guides/setup.md
+            https://example.com/page.html -> page.md
+        """
+        parsed = urlparse(url)
+        path = parsed.path.strip("/")
+
+        if not path:
+            return "index.md"
+
+        # Remove common file extensions and add .md
+        base, ext = os.path.splitext(path)
+        if ext.lower() in [".html", ".htm", ".php", ".asp", ".aspx", ".jsp"]:
+            path = base
+
+        if not path.endswith(".md"):
+            path = f"{path}.md"
+
+        return path
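The added _url_to_virtual_path helper is what makes crawled pages addressable like files. A standalone sketch of the same logic for quick experimentation (the free function name is illustrative; the real method lives on CrawlerLoader):

from urllib.parse import urlparse
import os


def url_to_virtual_path(url):
    # Mirror of the helper above: strip the URL path, map the site root to
    # index.md, drop common web extensions, and append .md.
    path = urlparse(url).path.strip("/")
    if not path:
        return "index.md"
    base, ext = os.path.splitext(path)
    if ext.lower() in [".html", ".htm", ".php", ".asp", ".aspx", ".jsp"]:
        path = base
    return path if path.endswith(".md") else f"{path}.md"


assert url_to_virtual_path("https://docs.docsgpt.cloud/guides/setup/") == "guides/setup.md"
assert url_to_virtual_path("https://example.com/page.html") == "page.md"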
@@ -7,6 +7,7 @@ import re
 from markdownify import markdownify
 from application.parser.schema.base import Document
 import tldextract
+import os

 class CrawlerLoader(BaseRemote):
     def __init__(self, limit=10, allow_subdomains=False):
@@ -57,13 +58,21 @@ class CrawlerLoader(BaseRemote):
             # Convert the HTML to Markdown for cleaner text formatting
             title, language, processed_markdown = self._process_html_to_markdown(html_content, current_url)
             if processed_markdown:
+                # Generate virtual file path from URL for consistent file-like matching
+                virtual_path = self._url_to_virtual_path(current_url)
+
                 # Create a Document for each visited page
                 documents.append(
                     Document(
                         processed_markdown,  # content
                         None,  # doc_id
                         None,  # embedding
-                        {"source": current_url, "title": title, "language": language}  # extra_info
+                        {
+                            "source": current_url,
+                            "title": title,
+                            "language": language,
+                            "file_path": virtual_path,
+                        },  # extra_info
                     )
                 )
@@ -145,4 +154,31 @@ class CrawlerLoader(BaseRemote):
             # Exact domain match
             if link_base == base_domain:
                 filtered.append(link)
-        return filtered
+        return filtered
+
+    def _url_to_virtual_path(self, url):
+        """
+        Convert a URL to a virtual file path ending with .md.
+
+        Examples:
+            https://docs.docsgpt.cloud/ -> index.md
+            https://docs.docsgpt.cloud/guides/setup -> guides/setup.md
+            https://docs.docsgpt.cloud/guides/setup/ -> guides/setup.md
+            https://example.com/page.html -> page.md
+        """
+        parsed = urlparse(url)
+        path = parsed.path.strip("/")
+
+        if not path:
+            return "index.md"
+
+        # Remove common file extensions and add .md
+        base, ext = os.path.splitext(path)
+        if ext.lower() in [".html", ".htm", ".php", ".asp", ".aspx", ".jsp"]:
+            path = base
+
+        # Ensure path ends with .md
+        if not path.endswith(".md"):
+            path = path + ".md"
+
+        return path
@@ -869,27 +869,33 @@ def remote_worker(
     logging.info("Total tokens calculated: %d", tokens)

     # Build directory structure from loaded documents
-    # Format matches local file uploads: flat structure with type, size_bytes, token_count
+    # Format matches local file uploads: nested structure with type, size_bytes, token_count
    directory_structure = {}
    for doc in raw_docs:
-        # Get the file path/name from doc_id or extra_info
-        file_path = doc.doc_id or ""
-        if not file_path and doc.extra_info:
-            file_path = doc.extra_info.get("key", "") or doc.extra_info.get(
-                "title", ""
-            )
+        # Get the file path from extra_info
+        # For crawlers: file_path is a virtual path like "guides/setup.md"
+        # For other remotes: use key or title as fallback
+        file_path = ""
+        if doc.extra_info:
+            file_path = (
+                doc.extra_info.get("file_path", "")
+                or doc.extra_info.get("key", "")
+                or doc.extra_info.get("title", "")
+            )
+        if not file_path:
+            file_path = doc.doc_id or ""

        if file_path:
-            # Use just the filename (last part of path) for flat structure
-            file_name = file_path.split("/")[-1] if "/" in file_path else file_path
-
             # Calculate token count
-            token_count = len(doc.text.split()) if doc.text else 0
+            token_count = num_tokens_from_string(doc.text) if doc.text else 0

             # Estimate size in bytes from text content
             size_bytes = len(doc.text.encode("utf-8")) if doc.text else 0

             # Guess mime type from extension
+            file_name = (
+                file_path.split("/")[-1] if "/" in file_path else file_path
+            )
             ext = os.path.splitext(file_name)[1].lower()
             mime_types = {
                 ".txt": "text/plain",
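Two behaviors change in the hunk above: the worker now resolves a document's path by preferring a crawler's virtual file_path (falling back to key, then title, then doc_id), and token counts come from num_tokens_from_string instead of a whitespace split. The lookup order in isolation, as a hedged sketch (resolve_file_path is an illustrative name, not a function in the codebase):

def resolve_file_path(doc_id, extra_info):
    # Prefer the crawler's virtual path, then remote key/title, then doc_id.
    file_path = ""
    if extra_info:
        file_path = (
            extra_info.get("file_path", "")
            or extra_info.get("key", "")
            or extra_info.get("title", "")
        )
    return file_path or (doc_id or "")


assert resolve_file_path(None, {"file_path": "guides/setup.md", "title": "Setup"}) == "guides/setup.md"
assert resolve_file_path("abc123", {"title": "Setup"}) == "Setup"
assert resolve_file_path("abc123", None) == "abc123"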
@@ -909,11 +915,23 @@ def remote_worker(
             }
             file_type = mime_types.get(ext, "application/octet-stream")

-            directory_structure[file_name] = {
-                "type": file_type,
-                "size_bytes": size_bytes,
-                "token_count": token_count,
-            }
+            # Build nested directory structure from path
+            # e.g., "guides/setup.md" -> {"guides": {"setup.md": {...}}}
+            path_parts = file_path.split("/")
+            current_level = directory_structure
+            for i, part in enumerate(path_parts):
+                if i == len(path_parts) - 1:
+                    # Last part is the file
+                    current_level[part] = {
+                        "type": file_type,
+                        "size_bytes": size_bytes,
+                        "token_count": token_count,
+                    }
+                else:
+                    # Intermediate parts are directories
+                    if part not in current_level:
+                        current_level[part] = {}
+                    current_level = current_level[part]

     logging.info(
         f"Built directory structure with {len(directory_structure)} files: "
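The nesting step turns slash-separated virtual paths into a tree of dictionaries, so crawled sites render like folder hierarchies. A minimal sketch of the same pattern (insert_into_tree is an illustrative name, the leaf values are made-up examples, and setdefault stands in for the if-not-in check above):

def insert_into_tree(tree, file_path, leaf):
    # Walk the path segments, creating dicts for directories and placing
    # the leaf metadata under the final segment.
    parts = file_path.split("/")
    current_level = tree
    for i, part in enumerate(parts):
        if i == len(parts) - 1:
            current_level[part] = leaf
        else:
            current_level = current_level.setdefault(part, {})
    return tree


tree = {}
insert_into_tree(tree, "guides/setup.md", {"type": "text/markdown", "size_bytes": 512, "token_count": 120})
insert_into_tree(tree, "index.md", {"type": "text/markdown", "size_bytes": 256, "token_count": 60})
assert tree == {
    "guides": {"setup.md": {"type": "text/markdown", "size_bytes": 512, "token_count": 120}},
    "index.md": {"type": "text/markdown", "size_bytes": 256, "token_count": 60},
}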
@@ -78,6 +78,9 @@ def test_load_data_crawls_same_domain_links(mock_requests_get, mock_validate_url
     sources = {doc.extra_info.get("source") for doc in result}
     assert sources == {"http://example.com", "http://example.com/about"}

+    paths = {doc.extra_info.get("file_path") for doc in result}
+    assert paths == {"index.md", "about.md"}
+
     texts = {doc.text for doc in result}
     assert texts == {"Root content", "About content"}
@@ -107,7 +110,10 @@ def test_load_data_accepts_list_input_and_adds_scheme(mock_requests_get, mock_va

     assert len(result) == 1
     assert result[0].text == "Homepage"
-    assert result[0].extra_info == {"source": "http://example.com"}
+    assert result[0].extra_info == {
+        "source": "http://example.com",
+        "file_path": "index.md",
+    }


 @patch("application.parser.remote.crawler_loader.validate_url", side_effect=_mock_validate_url)
@@ -190,3 +196,17 @@ def test_load_data_returns_empty_on_ssrf_validation_failure(mock_validate_url):
     assert result == []
     mock_validate_url.assert_called_once()
+
+
+def test_url_to_virtual_path_variants():
+    crawler = CrawlerLoader()
+
+    assert crawler._url_to_virtual_path("https://docs.docsgpt.cloud/") == "index.md"
+    assert (
+        crawler._url_to_virtual_path("https://docs.docsgpt.cloud/guides/setup")
+        == "guides/setup.md"
+    )
+    assert (
+        crawler._url_to_virtual_path("https://docs.docsgpt.cloud/guides/setup/")
+        == "guides/setup.md"
+    )
+    assert crawler._url_to_virtual_path("https://example.com/page.html") == "page.md"