From df570536135e17ffe28bb8c8cc5283a3bf267f7e Mon Sep 17 00:00:00 2001
From: Alex
Date: Mon, 5 Jan 2026 22:52:12 +0000
Subject: [PATCH] feat: improve crawlers and update chunk filtering (#2250)

---
 application/api/user/sources/chunks.py        |  9 +++-
 application/parser/remote/crawler_loader.py   | 32 ++++++++++++-
 application/parser/remote/crawler_markdown.py | 40 +++++++++++++++-
 application/worker.py                         | 48 +++++++++++++------
 tests/parser/remote/test_crawler_loader.py    | 22 ++++++++-
 5 files changed, 130 insertions(+), 21 deletions(-)

diff --git a/application/api/user/sources/chunks.py b/application/api/user/sources/chunks.py
index 44afb13b..5356c721 100644
--- a/application/api/user/sources/chunks.py
+++ b/application/api/user/sources/chunks.py
@@ -55,9 +55,14 @@ class GetChunks(Resource):
 
         if path:
             chunk_source = metadata.get("source", "")
-            # Check if the chunk's source matches the requested path
+            chunk_file_path = metadata.get("file_path", "")
+            # Check if the chunk matches the requested path
+            # For file uploads: source ends with path (e.g., "inputs/.../file.pdf" ends with "file.pdf")
+            # For crawlers: file_path ends with path (e.g., "guides/setup.md" ends with "setup.md")
+            source_match = chunk_source and chunk_source.endswith(path)
+            file_path_match = chunk_file_path and chunk_file_path.endswith(path)
 
-            if not chunk_source or not chunk_source.endswith(path):
+            if not (source_match or file_path_match):
                 continue
 
             # Filter by search term if provided
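The filter above accepts a chunk when either metadata field ends with the requested path: uploaded files keep matching through "source" (a storage path), while crawled pages match through the new "file_path" virtual path. A minimal standalone sketch of that rule, with made-up metadata values for illustration:

def matches_path(metadata, path):
    # Suffix match against the upload source or the crawler's virtual file path.
    chunk_source = metadata.get("source", "")
    chunk_file_path = metadata.get("file_path", "")
    source_match = bool(chunk_source) and chunk_source.endswith(path)
    file_path_match = bool(chunk_file_path) and chunk_file_path.endswith(path)
    return source_match or file_path_match

# Uploaded file: matched through "source".
assert matches_path({"source": "inputs/local/docs/file.pdf"}, "file.pdf")
# Crawled page: matched through "file_path".
assert matches_path(
    {"source": "https://docs.docsgpt.cloud/guides/setup", "file_path": "guides/setup.md"},
    "setup.md",
)
# Neither field matches.
assert not matches_path({"source": "inputs/local/docs/file.pdf"}, "other.pdf")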
diff --git a/application/parser/remote/crawler_loader.py b/application/parser/remote/crawler_loader.py
index 1bfd2276..fed69978 100644
--- a/application/parser/remote/crawler_loader.py
+++ b/application/parser/remote/crawler_loader.py
@@ -1,4 +1,5 @@
 import logging
+import os
 import requests
 from urllib.parse import urlparse, urljoin
 from bs4 import BeautifulSoup
@@ -47,10 +48,13 @@ class CrawlerLoader(BaseRemote):
                 docs = loader.load()
                 # Convert the loaded documents to your Document schema
                 for doc in docs:
+                    metadata = dict(doc.metadata or {})
+                    source_url = metadata.get("source") or current_url
+                    metadata["file_path"] = self._url_to_virtual_path(source_url)
                     loaded_content.append(
                         Document(
                             doc.page_content,
-                            extra_info=doc.metadata
+                            extra_info=metadata
                         )
                     )
             except Exception as e:
@@ -74,3 +78,29 @@
                 break
 
         return loaded_content
+
+    def _url_to_virtual_path(self, url):
+        """
+        Convert a URL to a virtual file path ending with .md.
+
+        Examples:
+            https://docs.docsgpt.cloud/ -> index.md
+            https://docs.docsgpt.cloud/guides/setup -> guides/setup.md
+            https://docs.docsgpt.cloud/guides/setup/ -> guides/setup.md
+            https://example.com/page.html -> page.md
+        """
+        parsed = urlparse(url)
+        path = parsed.path.strip("/")
+
+        if not path:
+            return "index.md"
+
+        # Remove common file extensions and add .md
+        base, ext = os.path.splitext(path)
+        if ext.lower() in [".html", ".htm", ".php", ".asp", ".aspx", ".jsp"]:
+            path = base
+
+        if not path.endswith(".md"):
+            path = f"{path}.md"
+
+        return path
diff --git a/application/parser/remote/crawler_markdown.py b/application/parser/remote/crawler_markdown.py
index 8fc4c92c..b037dece 100644
--- a/application/parser/remote/crawler_markdown.py
+++ b/application/parser/remote/crawler_markdown.py
@@ -7,6 +7,7 @@ import re
 from markdownify import markdownify
 from application.parser.schema.base import Document
 import tldextract
+import os
 
 class CrawlerLoader(BaseRemote):
     def __init__(self, limit=10, allow_subdomains=False):
@@ -57,13 +58,21 @@
             # Convert the HTML to Markdown for cleaner text formatting
             title, language, processed_markdown = self._process_html_to_markdown(html_content, current_url)
             if processed_markdown:
+                # Generate virtual file path from URL for consistent file-like matching
+                virtual_path = self._url_to_virtual_path(current_url)
+
                 # Create a Document for each visited page
                 documents.append(
                     Document(
                         processed_markdown,  # content
                         None,  # doc_id
                         None,  # embedding
-                        {"source": current_url, "title": title, "language": language}  # extra_info
+                        {
+                            "source": current_url,
+                            "title": title,
+                            "language": language,
+                            "file_path": virtual_path,
+                        },  # extra_info
                     )
                 )
 
@@ -145,4 +154,31 @@
             # Exact domain match
             if link_base == base_domain:
                 filtered.append(link)
-        return filtered
\ No newline at end of file
+        return filtered
+
+    def _url_to_virtual_path(self, url):
+        """
+        Convert a URL to a virtual file path ending with .md.
+
+        Examples:
+            https://docs.docsgpt.cloud/ -> index.md
+            https://docs.docsgpt.cloud/guides/setup -> guides/setup.md
+            https://docs.docsgpt.cloud/guides/setup/ -> guides/setup.md
+            https://example.com/page.html -> page.md
+        """
+        parsed = urlparse(url)
+        path = parsed.path.strip("/")
+
+        if not path:
+            return "index.md"
+
+        # Remove common file extensions and add .md
+        base, ext = os.path.splitext(path)
+        if ext.lower() in [".html", ".htm", ".php", ".asp", ".aspx", ".jsp"]:
+            path = base
+
+        # Ensure path ends with .md
+        if not path.endswith(".md"):
+            path = path + ".md"
+
+        return path
\ No newline at end of file
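Both crawlers now derive the same virtual path from a page URL: strip the surrounding slashes, drop common web extensions, and append ".md", with the bare domain mapping to index.md. A standalone sketch of that conversion (written at module level purely for illustration; the methods in the diff above are the actual implementation):

import os
from urllib.parse import urlparse

def url_to_virtual_path(url):
    # Mirrors the helper added to both crawlers: "" -> index.md, "<path>" -> "<path>.md".
    path = urlparse(url).path.strip("/")
    if not path:
        return "index.md"
    base, ext = os.path.splitext(path)
    if ext.lower() in [".html", ".htm", ".php", ".asp", ".aspx", ".jsp"]:
        path = base
    return path if path.endswith(".md") else f"{path}.md"

assert url_to_virtual_path("https://docs.docsgpt.cloud/") == "index.md"
assert url_to_virtual_path("https://docs.docsgpt.cloud/guides/setup/") == "guides/setup.md"
assert url_to_virtual_path("https://example.com/page.html") == "page.md"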
diff --git a/application/worker.py b/application/worker.py
index 1fa39e3c..a4265bc1 100755
--- a/application/worker.py
+++ b/application/worker.py
@@ -869,27 +869,33 @@ def remote_worker(
     logging.info("Total tokens calculated: %d", tokens)
 
     # Build directory structure from loaded documents
-    # Format matches local file uploads: flat structure with type, size_bytes, token_count
+    # Format matches local file uploads: nested structure with type, size_bytes, token_count
     directory_structure = {}
 
     for doc in raw_docs:
-        # Get the file path/name from doc_id or extra_info
-        file_path = doc.doc_id or ""
-        if not file_path and doc.extra_info:
-            file_path = doc.extra_info.get("key", "") or doc.extra_info.get(
-                "title", ""
+        # Get the file path from extra_info
+        # For crawlers: file_path is a virtual path like "guides/setup.md"
+        # For other remotes: use key or title as fallback
+        file_path = ""
+        if doc.extra_info:
+            file_path = (
+                doc.extra_info.get("file_path", "")
+                or doc.extra_info.get("key", "")
+                or doc.extra_info.get("title", "")
            )
+        if not file_path:
+            file_path = doc.doc_id or ""
 
         if file_path:
-            # Use just the filename (last part of path) for flat structure
-            file_name = file_path.split("/")[-1] if "/" in file_path else file_path
-
             # Calculate token count
-            token_count = len(doc.text.split()) if doc.text else 0
+            token_count = num_tokens_from_string(doc.text) if doc.text else 0
 
             # Estimate size in bytes from text content
             size_bytes = len(doc.text.encode("utf-8")) if doc.text else 0
 
             # Guess mime type from extension
+            file_name = (
+                file_path.split("/")[-1] if "/" in file_path else file_path
+            )
             ext = os.path.splitext(file_name)[1].lower()
             mime_types = {
                 ".txt": "text/plain",
                 ...
             }
             file_type = mime_types.get(ext, "application/octet-stream")
 
-            directory_structure[file_name] = {
-                "type": file_type,
-                "size_bytes": size_bytes,
-                "token_count": token_count,
-            }
+            # Build nested directory structure from path
+            # e.g., "guides/setup.md" -> {"guides": {"setup.md": {...}}}
+            path_parts = file_path.split("/")
+            current_level = directory_structure
+            for i, part in enumerate(path_parts):
+                if i == len(path_parts) - 1:
+                    # Last part is the file
+                    current_level[part] = {
+                        "type": file_type,
+                        "size_bytes": size_bytes,
+                        "token_count": token_count,
+                    }
+                else:
+                    # Intermediate parts are directories
+                    if part not in current_level:
+                        current_level[part] = {}
+                    current_level = current_level[part]
 
     logging.info(
         f"Built directory structure with {len(directory_structure)} files: "
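Instead of keying everything by bare filename, the worker now folds each path into a nested mapping, so "guides/setup.md" becomes a "guides" directory containing a "setup.md" entry and pages with the same name in different sections no longer collide. A trimmed-down sketch of that folding step (it uses dict.setdefault for brevity and placeholder metadata values; the worker loop above is the authoritative version):

def insert_into_tree(tree, file_path, file_info):
    # Walk/create intermediate directories, then attach the file entry at the leaf.
    current_level = tree
    parts = file_path.split("/")
    for part in parts[:-1]:
        current_level = current_level.setdefault(part, {})
    current_level[parts[-1]] = file_info
    return tree

tree = {}
insert_into_tree(tree, "index.md", {"type": "text/markdown", "size_bytes": 512, "token_count": 90})
insert_into_tree(tree, "guides/setup.md", {"type": "text/markdown", "size_bytes": 1024, "token_count": 200})
assert tree == {
    "index.md": {"type": "text/markdown", "size_bytes": 512, "token_count": 90},
    "guides": {"setup.md": {"type": "text/markdown", "size_bytes": 1024, "token_count": 200}},
}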
diff --git a/tests/parser/remote/test_crawler_loader.py b/tests/parser/remote/test_crawler_loader.py
index 06d27517..8c9a97a3 100644
--- a/tests/parser/remote/test_crawler_loader.py
+++ b/tests/parser/remote/test_crawler_loader.py
@@ -78,6 +78,9 @@ def test_load_data_crawls_same_domain_links(mock_requests_get, mock_validate_url
     sources = {doc.extra_info.get("source") for doc in result}
     assert sources == {"http://example.com", "http://example.com/about"}
 
+    paths = {doc.extra_info.get("file_path") for doc in result}
+    assert paths == {"index.md", "about.md"}
+
     texts = {doc.text for doc in result}
     assert texts == {"Root content", "About content"}
 
@@ -107,7 +110,10 @@ def test_load_data_accepts_list_input_and_adds_scheme(mock_requests_get, mock_va
 
     assert len(result) == 1
     assert result[0].text == "Homepage"
-    assert result[0].extra_info == {"source": "http://example.com"}
+    assert result[0].extra_info == {
+        "source": "http://example.com",
+        "file_path": "index.md",
+    }
 
 
 @patch("application.parser.remote.crawler_loader.validate_url", side_effect=_mock_validate_url)
@@ -190,3 +196,17 @@ def test_load_data_returns_empty_on_ssrf_validation_failure(mock_validate_url):
 
     assert result == []
     mock_validate_url.assert_called_once()
+
+def test_url_to_virtual_path_variants():
+    crawler = CrawlerLoader()
+
+    assert crawler._url_to_virtual_path("https://docs.docsgpt.cloud/") == "index.md"
+    assert (
+        crawler._url_to_virtual_path("https://docs.docsgpt.cloud/guides/setup")
+        == "guides/setup.md"
+    )
+    assert (
+        crawler._url_to_virtual_path("https://docs.docsgpt.cloud/guides/setup/")
+        == "guides/setup.md"
+    )
+    assert crawler._url_to_virtual_path("https://example.com/page.html") == "page.md"
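Taken together, a crawled page such as https://docs.docsgpt.cloud/guides/setup now carries file_path "guides/setup.md", appears under a "guides" directory in the remote source's directory structure, and can be filtered in the chunks endpoint with path=setup.md. Assuming the project's standard pytest setup, the new helper test can be run on its own with:

python -m pytest tests/parser/remote/test_crawler_loader.py::test_url_to_virtual_path_variants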