From df570536135e17ffe28bb8c8cc5283a3bf267f7e Mon Sep 17 00:00:00 2001
From: Alex
Date: Mon, 5 Jan 2026 22:52:12 +0000
Subject: [PATCH] feat: improve crawlers and update chunk filtering (#2250)

---
 application/api/user/sources/chunks.py        |  9 +++-
 application/parser/remote/crawler_loader.py   | 32 ++++++++++++-
 application/parser/remote/crawler_markdown.py | 40 +++++++++++++++-
 application/worker.py                         | 48 +++++++++++++------
 tests/parser/remote/test_crawler_loader.py    | 22 ++++++++-
 5 files changed, 130 insertions(+), 21 deletions(-)

diff --git a/application/api/user/sources/chunks.py b/application/api/user/sources/chunks.py
index 44afb13b..5356c721 100644
--- a/application/api/user/sources/chunks.py
+++ b/application/api/user/sources/chunks.py
@@ -55,9 +55,14 @@ class GetChunks(Resource):
 
         if path:
             chunk_source = metadata.get("source", "")
-            # Check if the chunk's source matches the requested path
+            chunk_file_path = metadata.get("file_path", "")
+            # Check if the chunk matches the requested path
+            # For file uploads: source ends with path (e.g., "inputs/.../file.pdf" ends with "file.pdf")
+            # For crawlers: file_path ends with path (e.g., "guides/setup.md" ends with "setup.md")
+            source_match = chunk_source and chunk_source.endswith(path)
+            file_path_match = chunk_file_path and chunk_file_path.endswith(path)
 
-            if not chunk_source or not chunk_source.endswith(path):
+            if not (source_match or file_path_match):
                 continue
 
             # Filter by search term if provided
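The filter above accepts a chunk when either metadata field ends with the requested path: uploaded files keep matching through "source" (a storage path), while crawled pages match through the new "file_path" virtual path. A minimal standalone sketch of that rule, with made-up metadata values for illustration:

def matches_path(metadata, path):
    # Suffix match against the upload source or the crawler's virtual file path.
    chunk_source = metadata.get("source", "")
    chunk_file_path = metadata.get("file_path", "")
    source_match = bool(chunk_source) and chunk_source.endswith(path)
    file_path_match = bool(chunk_file_path) and chunk_file_path.endswith(path)
    return source_match or file_path_match

# Uploaded file: matched through "source".
assert matches_path({"source": "inputs/local/docs/file.pdf"}, "file.pdf")
# Crawled page: matched through "file_path".
assert matches_path(
    {"source": "https://docs.docsgpt.cloud/guides/setup", "file_path": "guides/setup.md"},
    "setup.md",
)
# Neither field matches.
assert not matches_path({"source": "inputs/local/docs/file.pdf"}, "other.pdf")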
diff --git a/application/parser/remote/crawler_loader.py b/application/parser/remote/crawler_loader.py
index 1bfd2276..fed69978 100644
--- a/application/parser/remote/crawler_loader.py
+++ b/application/parser/remote/crawler_loader.py
@@ -1,4 +1,5 @@
 import logging
+import os
 import requests
 from urllib.parse import urlparse, urljoin
 from bs4 import BeautifulSoup
@@ -47,10 +48,13 @@ class CrawlerLoader(BaseRemote):
                 docs = loader.load()
                 # Convert the loaded documents to your Document schema
                 for doc in docs:
+                    metadata = dict(doc.metadata or {})
+                    source_url = metadata.get("source") or current_url
+                    metadata["file_path"] = self._url_to_virtual_path(source_url)
                     loaded_content.append(
                         Document(
                             doc.page_content,
-                            extra_info=doc.metadata
+                            extra_info=metadata
                         )
                     )
             except Exception as e:
@@ -74,3 +78,29 @@
                 break
 
         return loaded_content
+
+    def _url_to_virtual_path(self, url):
+        """
+        Convert a URL to a virtual file path ending with .md.
+
+        Examples:
+            https://docs.docsgpt.cloud/ -> index.md
+            https://docs.docsgpt.cloud/guides/setup -> guides/setup.md
+            https://docs.docsgpt.cloud/guides/setup/ -> guides/setup.md
+            https://example.com/page.html -> page.md
+        """
+        parsed = urlparse(url)
+        path = parsed.path.strip("/")
+
+        if not path:
+            return "index.md"
+
+        # Remove common file extensions and add .md
+        base, ext = os.path.splitext(path)
+        if ext.lower() in [".html", ".htm", ".php", ".asp", ".aspx", ".jsp"]:
+            path = base
+
+        if not path.endswith(".md"):
+            path = f"{path}.md"
+
+        return path
diff --git a/application/parser/remote/crawler_markdown.py b/application/parser/remote/crawler_markdown.py
index 8fc4c92c..b037dece 100644
--- a/application/parser/remote/crawler_markdown.py
+++ b/application/parser/remote/crawler_markdown.py
@@ -7,6 +7,7 @@ import re
 from markdownify import markdownify
 from application.parser.schema.base import Document
 import tldextract
+import os
 
 class CrawlerLoader(BaseRemote):
     def __init__(self, limit=10, allow_subdomains=False):
@@ -57,13 +58,21 @@
             # Convert the HTML to Markdown for cleaner text formatting
             title, language, processed_markdown = self._process_html_to_markdown(html_content, current_url)
             if processed_markdown:
+                # Generate virtual file path from URL for consistent file-like matching
+                virtual_path = self._url_to_virtual_path(current_url)
+
                 # Create a Document for each visited page
                 documents.append(
                     Document(
                         processed_markdown,  # content
                         None,  # doc_id
                         None,  # embedding
-                        {"source": current_url, "title": title, "language": language}  # extra_info
+                        {
+                            "source": current_url,
+                            "title": title,
+                            "language": language,
+                            "file_path": virtual_path,
+                        },  # extra_info
                     )
                 )
 
@@ -145,4 +154,31 @@
             # Exact domain match
             if link_base == base_domain:
                 filtered.append(link)
-        return filtered
\ No newline at end of file
+        return filtered
+
+    def _url_to_virtual_path(self, url):
+        """
+        Convert a URL to a virtual file path ending with .md.
+
+        Examples:
+            https://docs.docsgpt.cloud/ -> index.md
+            https://docs.docsgpt.cloud/guides/setup -> guides/setup.md
+            https://docs.docsgpt.cloud/guides/setup/ -> guides/setup.md
+            https://example.com/page.html -> page.md
+        """
+        parsed = urlparse(url)
+        path = parsed.path.strip("/")
+
+        if not path:
+            return "index.md"
+
+        # Remove common file extensions and add .md
+        base, ext = os.path.splitext(path)
+        if ext.lower() in [".html", ".htm", ".php", ".asp", ".aspx", ".jsp"]:
+            path = base
+
+        # Ensure path ends with .md
+        if not path.endswith(".md"):
+            path = path + ".md"
+
+        return path
\ No newline at end of file
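Both crawlers now derive the same virtual path from a page URL: strip the surrounding slashes, drop common web extensions, and append ".md", with the bare domain mapping to index.md. A standalone sketch of that conversion (written at module level purely for illustration; the methods in the diff above are the actual implementation):

import os
from urllib.parse import urlparse

def url_to_virtual_path(url):
    # Mirrors the helper added to both crawlers: "" -> index.md, "<path>" -> "<path>.md".
    path = urlparse(url).path.strip("/")
    if not path:
        return "index.md"
    base, ext = os.path.splitext(path)
    if ext.lower() in [".html", ".htm", ".php", ".asp", ".aspx", ".jsp"]:
        path = base
    return path if path.endswith(".md") else f"{path}.md"

assert url_to_virtual_path("https://docs.docsgpt.cloud/") == "index.md"
assert url_to_virtual_path("https://docs.docsgpt.cloud/guides/setup/") == "guides/setup.md"
assert url_to_virtual_path("https://example.com/page.html") == "page.md"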
diff --git a/application/worker.py b/application/worker.py
index 1fa39e3c..a4265bc1 100755
--- a/application/worker.py
+++ b/application/worker.py
@@ -869,27 +869,33 @@ def remote_worker(
     logging.info("Total tokens calculated: %d", tokens)
 
     # Build directory structure from loaded documents
-    # Format matches local file uploads: flat structure with type, size_bytes, token_count
+    # Format matches local file uploads: nested structure with type, size_bytes, token_count
     directory_structure = {}
 
     for doc in raw_docs:
-        # Get the file path/name from doc_id or extra_info
-        file_path = doc.doc_id or ""
-        if not file_path and doc.extra_info:
-            file_path = doc.extra_info.get("key", "") or doc.extra_info.get(
-                "title", ""
+        # Get the file path from extra_info
+        # For crawlers: file_path is a virtual path like "guides/setup.md"
+        # For other remotes: use key or title as fallback
+        file_path = ""
+        if doc.extra_info:
+            file_path = (
+                doc.extra_info.get("file_path", "")
+                or doc.extra_info.get("key", "")
+                or doc.extra_info.get("title", "")
            )
+        if not file_path:
+            file_path = doc.doc_id or ""
 
         if file_path:
-            # Use just the filename (last part of path) for flat structure
-            file_name = file_path.split("/")[-1] if "/" in file_path else file_path
-
             # Calculate token count
-            token_count = len(doc.text.split()) if doc.text else 0
+            token_count = num_tokens_from_string(doc.text) if doc.text else 0
 
             # Estimate size in bytes from text content
             size_bytes = len(doc.text.encode("utf-8")) if doc.text else 0
 
             # Guess mime type from extension
+            file_name = (
+                file_path.split("/")[-1] if "/" in file_path else file_path
+            )
             ext = os.path.splitext(file_name)[1].lower()
             mime_types = {
                 ".txt": "text/plain",
                 ...
             }
             file_type = mime_types.get(ext, "application/octet-stream")
 
-            directory_structure[file_name] = {
-                "type": file_type,
-                "size_bytes": size_bytes,
-                "token_count": token_count,
-            }
+            # Build nested directory structure from path
+            # e.g., "guides/setup.md" -> {"guides": {"setup.md": {...}}}
+            path_parts = file_path.split("/")
+            current_level = directory_structure
+            for i, part in enumerate(path_parts):
+                if i == len(path_parts) - 1:
+                    # Last part is the file
+                    current_level[part] = {
+                        "type": file_type,
+                        "size_bytes": size_bytes,
+                        "token_count": token_count,
+                    }
+                else:
+                    # Intermediate parts are directories
+                    if part not in current_level:
+                        current_level[part] = {}
+                    current_level = current_level[part]
 
     logging.info(
         f"Built directory structure with {len(directory_structure)} files: "
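Instead of keying everything by bare filename, the worker now folds each path into a nested mapping, so "guides/setup.md" becomes a "guides" directory containing a "setup.md" entry and pages with the same name in different sections no longer collide. A trimmed-down sketch of that folding step (it uses dict.setdefault for brevity and placeholder metadata values; the worker loop above is the authoritative version):

def insert_into_tree(tree, file_path, file_info):
    # Walk/create intermediate directories, then attach the file entry at the leaf.
    current_level = tree
    parts = file_path.split("/")
    for part in parts[:-1]:
        current_level = current_level.setdefault(part, {})
    current_level[parts[-1]] = file_info
    return tree

tree = {}
insert_into_tree(tree, "index.md", {"type": "text/markdown", "size_bytes": 512, "token_count": 90})
insert_into_tree(tree, "guides/setup.md", {"type": "text/markdown", "size_bytes": 1024, "token_count": 200})
assert tree == {
    "index.md": {"type": "text/markdown", "size_bytes": 512, "token_count": 90},
    "guides": {"setup.md": {"type": "text/markdown", "size_bytes": 1024, "token_count": 200}},
}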
diff --git a/tests/parser/remote/test_crawler_loader.py b/tests/parser/remote/test_crawler_loader.py
index 06d27517..8c9a97a3 100644
--- a/tests/parser/remote/test_crawler_loader.py
+++ b/tests/parser/remote/test_crawler_loader.py
@@ -78,6 +78,9 @@ def test_load_data_crawls_same_domain_links(mock_requests_get, mock_validate_url
     sources = {doc.extra_info.get("source") for doc in result}
     assert sources == {"http://example.com", "http://example.com/about"}
 
+    paths = {doc.extra_info.get("file_path") for doc in result}
+    assert paths == {"index.md", "about.md"}
+
     texts = {doc.text for doc in result}
     assert texts == {"Root content", "About content"}
 
@@ -107,7 +110,10 @@ def test_load_data_accepts_list_input_and_adds_scheme(mock_requests_get, mock_va
 
     assert len(result) == 1
     assert result[0].text == "Homepage"
-    assert result[0].extra_info == {"source": "http://example.com"}
+    assert result[0].extra_info == {
+        "source": "http://example.com",
+        "file_path": "index.md",
+    }
 
 
 @patch("application.parser.remote.crawler_loader.validate_url", side_effect=_mock_validate_url)
@@ -190,3 +196,17 @@ def test_load_data_returns_empty_on_ssrf_validation_failure(mock_validate_url):
 
     assert result == []
     mock_validate_url.assert_called_once()
+
+def test_url_to_virtual_path_variants():
+    crawler = CrawlerLoader()
+
+    assert crawler._url_to_virtual_path("https://docs.docsgpt.cloud/") == "index.md"
+    assert (
+        crawler._url_to_virtual_path("https://docs.docsgpt.cloud/guides/setup")
+        == "guides/setup.md"
+    )
+    assert (
+        crawler._url_to_virtual_path("https://docs.docsgpt.cloud/guides/setup/")
+        == "guides/setup.md"
+    )
+    assert crawler._url_to_virtual_path("https://example.com/page.html") == "page.md"
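Taken together, a crawled page such as https://docs.docsgpt.cloud/guides/setup now carries file_path "guides/setup.md", appears under a "guides" directory in the remote source's directory structure, and can be filtered in the chunks endpoint with path=setup.md. Assuming the project's standard pytest setup, the new helper test can be run on its own with:

python -m pytest tests/parser/remote/test_crawler_loader.py::test_url_to_virtual_path_variants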