feat: improve crawlers and update chunk filtering (#2250)

Alex
2026-01-05 22:52:12 +00:00
committed by GitHub
parent 5662be12b5
commit df57053613
5 changed files with 130 additions and 21 deletions

View File

@@ -55,9 +55,14 @@ class GetChunks(Resource):
             if path:
                 chunk_source = metadata.get("source", "")
-                # Check if the chunk's source matches the requested path
+                chunk_file_path = metadata.get("file_path", "")
+                # Check if the chunk matches the requested path
+                # For file uploads: source ends with path (e.g., "inputs/.../file.pdf" ends with "file.pdf")
+                # For crawlers: file_path ends with path (e.g., "guides/setup.md" ends with "setup.md")
+                source_match = chunk_source and chunk_source.endswith(path)
+                file_path_match = chunk_file_path and chunk_file_path.endswith(path)
-                if not chunk_source or not chunk_source.endswith(path):
+                if not (source_match or file_path_match):
                     continue
             # Filter by search term if provided
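For reference, a minimal standalone sketch of the matching rule the updated filter applies (chunk_matches_path and the example metadata dicts are illustrative, not code from this commit):

def chunk_matches_path(metadata: dict, path: str) -> bool:
    # A chunk matches if either its upload source or its crawler-generated
    # virtual file_path ends with the requested path.
    chunk_source = metadata.get("source", "")
    chunk_file_path = metadata.get("file_path", "")
    source_match = bool(chunk_source) and chunk_source.endswith(path)
    file_path_match = bool(chunk_file_path) and chunk_file_path.endswith(path)
    return source_match or file_path_match

# Illustrative inputs only:
assert chunk_matches_path({"source": "inputs/user/file.pdf"}, "file.pdf")
assert chunk_matches_path(
    {"source": "https://docs.docsgpt.cloud/guides/setup", "file_path": "guides/setup.md"},
    "setup.md",
)
assert not chunk_matches_path({"source": "https://example.com/"}, "setup.md")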

View File

@@ -1,4 +1,5 @@
 import logging
+import os
 import requests
 from urllib.parse import urlparse, urljoin
 from bs4 import BeautifulSoup

@@ -47,10 +48,13 @@ class CrawlerLoader(BaseRemote):
                 docs = loader.load()
                 # Convert the loaded documents to your Document schema
                 for doc in docs:
+                    metadata = dict(doc.metadata or {})
+                    source_url = metadata.get("source") or current_url
+                    metadata["file_path"] = self._url_to_virtual_path(source_url)
                     loaded_content.append(
                         Document(
                             doc.page_content,
-                            extra_info=doc.metadata
+                            extra_info=metadata
                         )
                     )
             except Exception as e:
@@ -74,3 +78,29 @@ class CrawlerLoader(BaseRemote):
                 break
         return loaded_content
+
+    def _url_to_virtual_path(self, url):
+        """
+        Convert a URL to a virtual file path ending with .md.
+
+        Examples:
+            https://docs.docsgpt.cloud/ -> index.md
+            https://docs.docsgpt.cloud/guides/setup -> guides/setup.md
+            https://docs.docsgpt.cloud/guides/setup/ -> guides/setup.md
+            https://example.com/page.html -> page.md
+        """
+        parsed = urlparse(url)
+        path = parsed.path.strip("/")
+        if not path:
+            return "index.md"
+        # Remove common file extensions and add .md
+        base, ext = os.path.splitext(path)
+        if ext.lower() in [".html", ".htm", ".php", ".asp", ".aspx", ".jsp"]:
+            path = base
+        if not path.endswith(".md"):
+            path = f"{path}.md"
+        return path
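As a quick usage sketch (the URLs below are made up, and a default constructor is assumed, mirroring the tests later in this commit): urlparse keeps query strings and fragments out of .path, so they never reach the virtual path.

crawler = CrawlerLoader()
print(crawler._url_to_virtual_path("https://example.com/guides/setup?lang=en"))  # guides/setup.md
print(crawler._url_to_virtual_path("https://example.com/blog/post.html#intro"))  # blog/post.md
print(crawler._url_to_virtual_path("https://example.com"))                       # index.md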

View File

@@ -7,6 +7,7 @@ import re
 from markdownify import markdownify
 from application.parser.schema.base import Document
 import tldextract
+import os

 class CrawlerLoader(BaseRemote):
     def __init__(self, limit=10, allow_subdomains=False):

@@ -57,13 +58,21 @@ class CrawlerLoader(BaseRemote):
             # Convert the HTML to Markdown for cleaner text formatting
             title, language, processed_markdown = self._process_html_to_markdown(html_content, current_url)
             if processed_markdown:
+                # Generate virtual file path from URL for consistent file-like matching
+                virtual_path = self._url_to_virtual_path(current_url)
                 # Create a Document for each visited page
                 documents.append(
                     Document(
                         processed_markdown, # content
                         None, # doc_id
                         None, # embedding
-                        {"source": current_url, "title": title, "language": language} # extra_info
+                        {
+                            "source": current_url,
+                            "title": title,
+                            "language": language,
+                            "file_path": virtual_path,
+                        }, # extra_info
                     )
                 )
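Concretely, each crawled page now carries both the original URL and the derived virtual path in its extra_info; the values below are hypothetical and depend on the fetched HTML:

extra_info = {
    "source": "https://docs.docsgpt.cloud/guides/setup",
    "title": "Setup Guide",
    "language": "en",
    "file_path": "guides/setup.md",  # from _url_to_virtual_path(current_url)
}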
@@ -145,4 +154,31 @@ class CrawlerLoader(BaseRemote):
             # Exact domain match
             if link_base == base_domain:
                 filtered.append(link)
-        return filtered
+        return filtered
+
+    def _url_to_virtual_path(self, url):
+        """
+        Convert a URL to a virtual file path ending with .md.
+
+        Examples:
+            https://docs.docsgpt.cloud/ -> index.md
+            https://docs.docsgpt.cloud/guides/setup -> guides/setup.md
+            https://docs.docsgpt.cloud/guides/setup/ -> guides/setup.md
+            https://example.com/page.html -> page.md
+        """
+        parsed = urlparse(url)
+        path = parsed.path.strip("/")
+        if not path:
+            return "index.md"
+        # Remove common file extensions and add .md
+        base, ext = os.path.splitext(path)
+        if ext.lower() in [".html", ".htm", ".php", ".asp", ".aspx", ".jsp"]:
+            path = base
+        # Ensure path ends with .md
+        if not path.endswith(".md"):
+            path = path + ".md"
+        return path

View File

@@ -869,27 +869,33 @@ def remote_worker(
     logging.info("Total tokens calculated: %d", tokens)

     # Build directory structure from loaded documents
-    # Format matches local file uploads: flat structure with type, size_bytes, token_count
+    # Format matches local file uploads: nested structure with type, size_bytes, token_count
     directory_structure = {}
     for doc in raw_docs:
-        # Get the file path/name from doc_id or extra_info
-        file_path = doc.doc_id or ""
-        if not file_path and doc.extra_info:
-            file_path = doc.extra_info.get("key", "") or doc.extra_info.get(
-                "title", ""
-            )
+        # Get the file path from extra_info
+        # For crawlers: file_path is a virtual path like "guides/setup.md"
+        # For other remotes: use key or title as fallback
+        file_path = ""
+        if doc.extra_info:
+            file_path = (
+                doc.extra_info.get("file_path", "")
+                or doc.extra_info.get("key", "")
+                or doc.extra_info.get("title", "")
+            )
+        if not file_path:
+            file_path = doc.doc_id or ""
         if file_path:
-            # Use just the filename (last part of path) for flat structure
-            file_name = file_path.split("/")[-1] if "/" in file_path else file_path
             # Calculate token count
-            token_count = len(doc.text.split()) if doc.text else 0
+            token_count = num_tokens_from_string(doc.text) if doc.text else 0
             # Estimate size in bytes from text content
            size_bytes = len(doc.text.encode("utf-8")) if doc.text else 0
             # Guess mime type from extension
+            file_name = (
+                file_path.split("/")[-1] if "/" in file_path else file_path
+            )
             ext = os.path.splitext(file_name)[1].lower()
             mime_types = {
                 ".txt": "text/plain",
@@ -909,11 +915,23 @@ def remote_worker(
             }
             file_type = mime_types.get(ext, "application/octet-stream")
-            directory_structure[file_name] = {
-                "type": file_type,
-                "size_bytes": size_bytes,
-                "token_count": token_count,
-            }
+            # Build nested directory structure from path
+            # e.g., "guides/setup.md" -> {"guides": {"setup.md": {...}}}
+            path_parts = file_path.split("/")
+            current_level = directory_structure
+            for i, part in enumerate(path_parts):
+                if i == len(path_parts) - 1:
+                    # Last part is the file
+                    current_level[part] = {
+                        "type": file_type,
+                        "size_bytes": size_bytes,
+                        "token_count": token_count,
+                    }
+                else:
+                    # Intermediate parts are directories
+                    if part not in current_level:
+                        current_level[part] = {}
+                    current_level = current_level[part]
     logging.info(
         f"Built directory structure with {len(directory_structure)} files: "

View File

@@ -78,6 +78,9 @@ def test_load_data_crawls_same_domain_links(mock_requests_get, mock_validate_url
     sources = {doc.extra_info.get("source") for doc in result}
     assert sources == {"http://example.com", "http://example.com/about"}

+    paths = {doc.extra_info.get("file_path") for doc in result}
+    assert paths == {"index.md", "about.md"}
+
     texts = {doc.text for doc in result}
     assert texts == {"Root content", "About content"}

@@ -107,7 +110,10 @@ def test_load_data_accepts_list_input_and_adds_scheme(mock_requests_get, mock_va
     assert len(result) == 1
     assert result[0].text == "Homepage"
-    assert result[0].extra_info == {"source": "http://example.com"}
+    assert result[0].extra_info == {
+        "source": "http://example.com",
+        "file_path": "index.md",
+    }

 @patch("application.parser.remote.crawler_loader.validate_url", side_effect=_mock_validate_url)

@@ -190,3 +196,17 @@ def test_load_data_returns_empty_on_ssrf_validation_failure(mock_validate_url):
     assert result == []
     mock_validate_url.assert_called_once()
+
+
+def test_url_to_virtual_path_variants():
+    crawler = CrawlerLoader()
+
+    assert crawler._url_to_virtual_path("https://docs.docsgpt.cloud/") == "index.md"
+    assert (
+        crawler._url_to_virtual_path("https://docs.docsgpt.cloud/guides/setup")
+        == "guides/setup.md"
+    )
+    assert (
+        crawler._url_to_virtual_path("https://docs.docsgpt.cloud/guides/setup/")
+        == "guides/setup.md"
+    )
+    assert crawler._url_to_virtual_path("https://example.com/page.html") == "page.md"