feat: improve crawlers and update chunk filtering (#2250)

Alex
2026-01-05 22:52:12 +00:00
committed by GitHub
parent 5662be12b5
commit df57053613
5 changed files with 130 additions and 21 deletions

View File

@@ -1,4 +1,5 @@
 import logging
+import os
 import requests
 from urllib.parse import urlparse, urljoin
 from bs4 import BeautifulSoup
@@ -47,10 +48,13 @@ class CrawlerLoader(BaseRemote):
                 docs = loader.load()
                 # Convert the loaded documents to your Document schema
                 for doc in docs:
+                    metadata = dict(doc.metadata or {})
+                    source_url = metadata.get("source") or current_url
+                    metadata["file_path"] = self._url_to_virtual_path(source_url)
                     loaded_content.append(
                         Document(
                             doc.page_content,
-                            extra_info=doc.metadata
+                            extra_info=metadata
                         )
                     )
             except Exception as e:
@@ -74,3 +78,29 @@ class CrawlerLoader(BaseRemote):
                 break
         return loaded_content
+
+    def _url_to_virtual_path(self, url):
+        """
+        Convert a URL to a virtual file path ending with .md.
+
+        Examples:
+            https://docs.docsgpt.cloud/ -> index.md
+            https://docs.docsgpt.cloud/guides/setup -> guides/setup.md
+            https://docs.docsgpt.cloud/guides/setup/ -> guides/setup.md
+            https://example.com/page.html -> page.md
+        """
+        parsed = urlparse(url)
+        path = parsed.path.strip("/")
+        if not path:
+            return "index.md"
+
+        # Remove common file extensions and add .md
+        base, ext = os.path.splitext(path)
+        if ext.lower() in [".html", ".htm", ".php", ".asp", ".aspx", ".jsp"]:
+            path = base
+
+        if not path.endswith(".md"):
+            path = f"{path}.md"
+        return path
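
As a quick sanity check (not part of the commit), the helper's mapping can be reproduced standalone with the same urlparse/splitext logic; the assertions mirror the examples in the docstring above:

import os
from urllib.parse import urlparse

def url_to_virtual_path(url):
    # Standalone copy of the _url_to_virtual_path helper added above.
    path = urlparse(url).path.strip("/")
    if not path:
        return "index.md"
    # Strip common web extensions, then normalize everything to .md
    base, ext = os.path.splitext(path)
    if ext.lower() in [".html", ".htm", ".php", ".asp", ".aspx", ".jsp"]:
        path = base
    if not path.endswith(".md"):
        path = f"{path}.md"
    return path

assert url_to_virtual_path("https://docs.docsgpt.cloud/") == "index.md"
assert url_to_virtual_path("https://docs.docsgpt.cloud/guides/setup/") == "guides/setup.md"
assert url_to_virtual_path("https://example.com/page.html") == "page.md"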

View File

@@ -7,6 +7,7 @@ import re
 from markdownify import markdownify
 from application.parser.schema.base import Document
 import tldextract
+import os

 class CrawlerLoader(BaseRemote):
     def __init__(self, limit=10, allow_subdomains=False):
@@ -57,13 +58,21 @@ class CrawlerLoader(BaseRemote):
                 # Convert the HTML to Markdown for cleaner text formatting
                 title, language, processed_markdown = self._process_html_to_markdown(html_content, current_url)

                 if processed_markdown:
+                    # Generate virtual file path from URL for consistent file-like matching
+                    virtual_path = self._url_to_virtual_path(current_url)
+
                     # Create a Document for each visited page
                     documents.append(
                         Document(
                             processed_markdown,  # content
                             None,  # doc_id
                             None,  # embedding
-                            {"source": current_url, "title": title, "language": language}  # extra_info
+                            {
+                                "source": current_url,
+                                "title": title,
+                                "language": language,
+                                "file_path": virtual_path,
+                            },  # extra_info
                         )
                     )
@@ -145,4 +154,31 @@ class CrawlerLoader(BaseRemote):
             # Exact domain match
             if link_base == base_domain:
                 filtered.append(link)
-        return filtered
+        return filtered
+
+    def _url_to_virtual_path(self, url):
+        """
+        Convert a URL to a virtual file path ending with .md.
+
+        Examples:
+            https://docs.docsgpt.cloud/ -> index.md
+            https://docs.docsgpt.cloud/guides/setup -> guides/setup.md
+            https://docs.docsgpt.cloud/guides/setup/ -> guides/setup.md
+            https://example.com/page.html -> page.md
+        """
+        parsed = urlparse(url)
+        path = parsed.path.strip("/")
+        if not path:
+            return "index.md"
+
+        # Remove common file extensions and add .md
+        base, ext = os.path.splitext(path)
+        if ext.lower() in [".html", ".htm", ".php", ".asp", ".aspx", ".jsp"]:
+            path = base
+
+        # Ensure path ends with .md
+        if not path.endswith(".md"):
+            path = path + ".md"
+        return path
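
The payoff of the "consistent file-like matching" mentioned in the added comment is that crawled pages now carry a file-style path in their metadata, so glob-style filters written for local files also apply to them. A hypothetical illustration (fnmatch, the pattern, and the sample metadata values are assumptions for this sketch, not code from this commit):

from fnmatch import fnmatch

# Hypothetical downstream use: extra_info of a crawled page, shaped as in
# the diff above, can be filtered with an ordinary path pattern.
extra_info = {
    "source": "https://docs.docsgpt.cloud/guides/setup",
    "title": "Setup",        # assumed sample value
    "language": "en",        # assumed sample value
    "file_path": "guides/setup.md",
}
assert fnmatch(extra_info["file_path"], "guides/*.md")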