feat: improve crawlers and update chunk filtering (#2250)

Alex
2026-01-05 22:52:12 +00:00
committed by GitHub
parent 5662be12b5
commit df57053613
5 changed files with 130 additions and 21 deletions

View File

@@ -55,9 +55,14 @@ class GetChunks(Resource):
             if path:
                 chunk_source = metadata.get("source", "")
-                # Check if the chunk's source matches the requested path
+                chunk_file_path = metadata.get("file_path", "")
+                # Check if the chunk matches the requested path
+                # For file uploads: source ends with path (e.g., "inputs/.../file.pdf" ends with "file.pdf")
+                # For crawlers: file_path ends with path (e.g., "guides/setup.md" ends with "setup.md")
+                source_match = chunk_source and chunk_source.endswith(path)
+                file_path_match = chunk_file_path and chunk_file_path.endswith(path)
-                if not chunk_source or not chunk_source.endswith(path):
+                if not (source_match or file_path_match):
                     continue
             # Filter by search term if provided
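For reference, a minimal standalone sketch of the matching rule the updated filter applies (chunk_matches_path and the example metadata dicts are illustrative, not code from this commit):

def chunk_matches_path(metadata: dict, path: str) -> bool:
    # A chunk matches if either its upload source or its crawler-generated
    # virtual file_path ends with the requested path.
    chunk_source = metadata.get("source", "")
    chunk_file_path = metadata.get("file_path", "")
    source_match = bool(chunk_source) and chunk_source.endswith(path)
    file_path_match = bool(chunk_file_path) and chunk_file_path.endswith(path)
    return source_match or file_path_match

# Illustrative inputs only:
assert chunk_matches_path({"source": "inputs/user/file.pdf"}, "file.pdf")
assert chunk_matches_path(
    {"source": "https://docs.docsgpt.cloud/guides/setup", "file_path": "guides/setup.md"},
    "setup.md",
)
assert not chunk_matches_path({"source": "https://example.com/"}, "setup.md")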

View File

@@ -1,4 +1,5 @@
 import logging
+import os
 import requests
 from urllib.parse import urlparse, urljoin
 from bs4 import BeautifulSoup

@@ -47,10 +48,13 @@ class CrawlerLoader(BaseRemote):
                 docs = loader.load()
                 # Convert the loaded documents to your Document schema
                 for doc in docs:
+                    metadata = dict(doc.metadata or {})
+                    source_url = metadata.get("source") or current_url
+                    metadata["file_path"] = self._url_to_virtual_path(source_url)
                     loaded_content.append(
                         Document(
                             doc.page_content,
-                            extra_info=doc.metadata
+                            extra_info=metadata
                         )
                     )
             except Exception as e:
@@ -74,3 +78,29 @@ class CrawlerLoader(BaseRemote):
                 break
         return loaded_content
+
+    def _url_to_virtual_path(self, url):
+        """
+        Convert a URL to a virtual file path ending with .md.
+
+        Examples:
+            https://docs.docsgpt.cloud/ -> index.md
+            https://docs.docsgpt.cloud/guides/setup -> guides/setup.md
+            https://docs.docsgpt.cloud/guides/setup/ -> guides/setup.md
+            https://example.com/page.html -> page.md
+        """
+        parsed = urlparse(url)
+        path = parsed.path.strip("/")
+        if not path:
+            return "index.md"
+        # Remove common file extensions and add .md
+        base, ext = os.path.splitext(path)
+        if ext.lower() in [".html", ".htm", ".php", ".asp", ".aspx", ".jsp"]:
+            path = base
+        if not path.endswith(".md"):
+            path = f"{path}.md"
+        return path
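As a quick usage sketch (the URLs below are made up, and a default constructor is assumed, mirroring the tests later in this commit): urlparse keeps query strings and fragments out of .path, so they never reach the virtual path.

crawler = CrawlerLoader()
print(crawler._url_to_virtual_path("https://example.com/guides/setup?lang=en"))  # guides/setup.md
print(crawler._url_to_virtual_path("https://example.com/blog/post.html#intro"))  # blog/post.md
print(crawler._url_to_virtual_path("https://example.com"))                       # index.md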

View File

@@ -7,6 +7,7 @@ import re
 from markdownify import markdownify
 from application.parser.schema.base import Document
 import tldextract
+import os

 class CrawlerLoader(BaseRemote):
     def __init__(self, limit=10, allow_subdomains=False):

@@ -57,13 +58,21 @@ class CrawlerLoader(BaseRemote):
             # Convert the HTML to Markdown for cleaner text formatting
             title, language, processed_markdown = self._process_html_to_markdown(html_content, current_url)
             if processed_markdown:
+                # Generate virtual file path from URL for consistent file-like matching
+                virtual_path = self._url_to_virtual_path(current_url)
                 # Create a Document for each visited page
                 documents.append(
                     Document(
                         processed_markdown, # content
                         None, # doc_id
                         None, # embedding
-                        {"source": current_url, "title": title, "language": language} # extra_info
+                        {
+                            "source": current_url,
+                            "title": title,
+                            "language": language,
+                            "file_path": virtual_path,
+                        }, # extra_info
                     )
                 )
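Concretely, each crawled page now carries both the original URL and the derived virtual path in its extra_info; the values below are hypothetical and depend on the fetched HTML:

extra_info = {
    "source": "https://docs.docsgpt.cloud/guides/setup",
    "title": "Setup Guide",
    "language": "en",
    "file_path": "guides/setup.md",  # from _url_to_virtual_path(current_url)
}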
@@ -145,4 +154,31 @@ class CrawlerLoader(BaseRemote):
             # Exact domain match
             if link_base == base_domain:
                 filtered.append(link)
-        return filtered
+        return filtered
+
+    def _url_to_virtual_path(self, url):
+        """
+        Convert a URL to a virtual file path ending with .md.
+
+        Examples:
+            https://docs.docsgpt.cloud/ -> index.md
+            https://docs.docsgpt.cloud/guides/setup -> guides/setup.md
+            https://docs.docsgpt.cloud/guides/setup/ -> guides/setup.md
+            https://example.com/page.html -> page.md
+        """
+        parsed = urlparse(url)
+        path = parsed.path.strip("/")
+        if not path:
+            return "index.md"
+        # Remove common file extensions and add .md
+        base, ext = os.path.splitext(path)
+        if ext.lower() in [".html", ".htm", ".php", ".asp", ".aspx", ".jsp"]:
+            path = base
+        # Ensure path ends with .md
+        if not path.endswith(".md"):
+            path = path + ".md"
+        return path

View File

@@ -869,27 +869,33 @@ def remote_worker(
     logging.info("Total tokens calculated: %d", tokens)

     # Build directory structure from loaded documents
-    # Format matches local file uploads: flat structure with type, size_bytes, token_count
+    # Format matches local file uploads: nested structure with type, size_bytes, token_count
     directory_structure = {}
     for doc in raw_docs:
-        # Get the file path/name from doc_id or extra_info
-        file_path = doc.doc_id or ""
-        if not file_path and doc.extra_info:
-            file_path = doc.extra_info.get("key", "") or doc.extra_info.get(
-                "title", ""
-            )
+        # Get the file path from extra_info
+        # For crawlers: file_path is a virtual path like "guides/setup.md"
+        # For other remotes: use key or title as fallback
+        file_path = ""
+        if doc.extra_info:
+            file_path = (
+                doc.extra_info.get("file_path", "")
+                or doc.extra_info.get("key", "")
+                or doc.extra_info.get("title", "")
+            )
+        if not file_path:
+            file_path = doc.doc_id or ""
         if file_path:
-            # Use just the filename (last part of path) for flat structure
-            file_name = file_path.split("/")[-1] if "/" in file_path else file_path
             # Calculate token count
-            token_count = len(doc.text.split()) if doc.text else 0
+            token_count = num_tokens_from_string(doc.text) if doc.text else 0
             # Estimate size in bytes from text content
            size_bytes = len(doc.text.encode("utf-8")) if doc.text else 0
             # Guess mime type from extension
+            file_name = (
+                file_path.split("/")[-1] if "/" in file_path else file_path
+            )
             ext = os.path.splitext(file_name)[1].lower()
             mime_types = {
                 ".txt": "text/plain",
@@ -909,11 +915,23 @@ def remote_worker(
             }
             file_type = mime_types.get(ext, "application/octet-stream")
-            directory_structure[file_name] = {
-                "type": file_type,
-                "size_bytes": size_bytes,
-                "token_count": token_count,
-            }
+            # Build nested directory structure from path
+            # e.g., "guides/setup.md" -> {"guides": {"setup.md": {...}}}
+            path_parts = file_path.split("/")
+            current_level = directory_structure
+            for i, part in enumerate(path_parts):
+                if i == len(path_parts) - 1:
+                    # Last part is the file
+                    current_level[part] = {
+                        "type": file_type,
+                        "size_bytes": size_bytes,
+                        "token_count": token_count,
+                    }
+                else:
+                    # Intermediate parts are directories
+                    if part not in current_level:
+                        current_level[part] = {}
+                    current_level = current_level[part]
     logging.info(
         f"Built directory structure with {len(directory_structure)} files: "

View File

@@ -78,6 +78,9 @@ def test_load_data_crawls_same_domain_links(mock_requests_get, mock_validate_url
     sources = {doc.extra_info.get("source") for doc in result}
     assert sources == {"http://example.com", "http://example.com/about"}

+    paths = {doc.extra_info.get("file_path") for doc in result}
+    assert paths == {"index.md", "about.md"}
+
     texts = {doc.text for doc in result}
     assert texts == {"Root content", "About content"}

@@ -107,7 +110,10 @@ def test_load_data_accepts_list_input_and_adds_scheme(mock_requests_get, mock_va
     assert len(result) == 1
     assert result[0].text == "Homepage"
-    assert result[0].extra_info == {"source": "http://example.com"}
+    assert result[0].extra_info == {
+        "source": "http://example.com",
+        "file_path": "index.md",
+    }

 @patch("application.parser.remote.crawler_loader.validate_url", side_effect=_mock_validate_url)

@@ -190,3 +196,17 @@ def test_load_data_returns_empty_on_ssrf_validation_failure(mock_validate_url):
     assert result == []
     mock_validate_url.assert_called_once()
+
+
+def test_url_to_virtual_path_variants():
+    crawler = CrawlerLoader()
+
+    assert crawler._url_to_virtual_path("https://docs.docsgpt.cloud/") == "index.md"
+    assert (
+        crawler._url_to_virtual_path("https://docs.docsgpt.cloud/guides/setup")
+        == "guides/setup.md"
+    )
+    assert (
+        crawler._url_to_virtual_path("https://docs.docsgpt.cloud/guides/setup/")
+        == "guides/setup.md"
+    )
+    assert crawler._url_to_virtual_path("https://example.com/page.html") == "page.md"