mirror of
https://github.com/arc53/DocsGPT.git
synced 2026-02-11 16:51:04 +00:00
feat: improve crawlers and update chunk filtering (#2250)
This commit is contained in:
@@ -1,4 +1,5 @@
|
||||
import logging
|
||||
import os
|
||||
import requests
|
||||
from urllib.parse import urlparse, urljoin
|
||||
from bs4 import BeautifulSoup
|
||||
@@ -47,10 +48,13 @@ class CrawlerLoader(BaseRemote):
|
||||
docs = loader.load()
|
||||
# Convert the loaded documents to your Document schema
|
||||
for doc in docs:
|
||||
metadata = dict(doc.metadata or {})
|
||||
source_url = metadata.get("source") or current_url
|
||||
metadata["file_path"] = self._url_to_virtual_path(source_url)
|
||||
loaded_content.append(
|
||||
Document(
|
||||
doc.page_content,
|
||||
extra_info=doc.metadata
|
||||
extra_info=metadata
|
||||
)
|
||||
)
|
||||
except Exception as e:
|
||||
@@ -74,3 +78,29 @@ class CrawlerLoader(BaseRemote):
|
||||
break
|
||||
|
||||
return loaded_content
|
||||
|
||||
def _url_to_virtual_path(self, url):
|
||||
"""
|
||||
Convert a URL to a virtual file path ending with .md.
|
||||
|
||||
Examples:
|
||||
https://docs.docsgpt.cloud/ -> index.md
|
||||
https://docs.docsgpt.cloud/guides/setup -> guides/setup.md
|
||||
https://docs.docsgpt.cloud/guides/setup/ -> guides/setup.md
|
||||
https://example.com/page.html -> page.md
|
||||
"""
|
||||
parsed = urlparse(url)
|
||||
path = parsed.path.strip("/")
|
||||
|
||||
if not path:
|
||||
return "index.md"
|
||||
|
||||
# Remove common file extensions and add .md
|
||||
base, ext = os.path.splitext(path)
|
||||
if ext.lower() in [".html", ".htm", ".php", ".asp", ".aspx", ".jsp"]:
|
||||
path = base
|
||||
|
||||
if not path.endswith(".md"):
|
||||
path = f"{path}.md"
|
||||
|
||||
return path
|
||||
|
||||
@@ -7,6 +7,7 @@ import re
|
||||
from markdownify import markdownify
|
||||
from application.parser.schema.base import Document
|
||||
import tldextract
|
||||
import os
|
||||
|
||||
class CrawlerLoader(BaseRemote):
|
||||
def __init__(self, limit=10, allow_subdomains=False):
|
||||
@@ -57,13 +58,21 @@ class CrawlerLoader(BaseRemote):
|
||||
# Convert the HTML to Markdown for cleaner text formatting
|
||||
title, language, processed_markdown = self._process_html_to_markdown(html_content, current_url)
|
||||
if processed_markdown:
|
||||
# Generate virtual file path from URL for consistent file-like matching
|
||||
virtual_path = self._url_to_virtual_path(current_url)
|
||||
|
||||
# Create a Document for each visited page
|
||||
documents.append(
|
||||
Document(
|
||||
processed_markdown, # content
|
||||
None, # doc_id
|
||||
None, # embedding
|
||||
{"source": current_url, "title": title, "language": language} # extra_info
|
||||
{
|
||||
"source": current_url,
|
||||
"title": title,
|
||||
"language": language,
|
||||
"file_path": virtual_path,
|
||||
}, # extra_info
|
||||
)
|
||||
)
|
||||
|
||||
@@ -145,4 +154,31 @@ class CrawlerLoader(BaseRemote):
|
||||
# Exact domain match
|
||||
if link_base == base_domain:
|
||||
filtered.append(link)
|
||||
return filtered
|
||||
return filtered
|
||||
|
||||
def _url_to_virtual_path(self, url):
|
||||
"""
|
||||
Convert a URL to a virtual file path ending with .md.
|
||||
|
||||
Examples:
|
||||
https://docs.docsgpt.cloud/ -> index.md
|
||||
https://docs.docsgpt.cloud/guides/setup -> guides/setup.md
|
||||
https://docs.docsgpt.cloud/guides/setup/ -> guides/setup.md
|
||||
https://example.com/page.html -> page.md
|
||||
"""
|
||||
parsed = urlparse(url)
|
||||
path = parsed.path.strip("/")
|
||||
|
||||
if not path:
|
||||
return "index.md"
|
||||
|
||||
# Remove common file extensions and add .md
|
||||
base, ext = os.path.splitext(path)
|
||||
if ext.lower() in [".html", ".htm", ".php", ".asp", ".aspx", ".jsp"]:
|
||||
path = base
|
||||
|
||||
# Ensure path ends with .md
|
||||
if not path.endswith(".md"):
|
||||
path = path + ".md"
|
||||
|
||||
return path
|
||||
Reference in New Issue
Block a user