Merge pull request #1582 from arc53/scraper-2

scraper with markdownify
Alex
2025-01-15 12:15:37 +00:00
committed by GitHub
5 changed files with 211 additions and 57 deletions
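
The core of the change is converting crawled HTML to Markdown with markdownify before chunking. A rough sketch of the conversion the new loader relies on (the HTML snippet and printed output here are illustrative, not taken from the diff):

from markdownify import markdownify

html = "<h1>Docs</h1><p>Hello <b>world</b>.</p>"  # hypothetical page fragment
text = markdownify(html, heading_style="ATX", newline_style="BACKSLASH")
print(text)  # roughly "# Docs\n\nHello **world**." (exact whitespace may differ)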

View File

@@ -2105,4 +2105,4 @@ class DeleteTool(Resource):
         except Exception as err:
             return {"success": False, "error": str(err)}, 400
         return {"success": True}, 200

View File

@@ -2,16 +2,16 @@ import requests
 from urllib.parse import urlparse, urljoin
 from bs4 import BeautifulSoup
 from application.parser.remote.base import BaseRemote
+from application.parser.schema.base import Document
+from langchain_community.document_loaders import WebBaseLoader
 
 class CrawlerLoader(BaseRemote):
     def __init__(self, limit=10):
-        from langchain_community.document_loaders import WebBaseLoader
         self.loader = WebBaseLoader  # Initialize the document loader
         self.limit = limit  # Set the limit for the number of pages to scrape
 
     def load_data(self, inputs):
         url = inputs
-        # Check if the input is a list and if it is, use the first element
         if isinstance(url, list) and url:
             url = url[0]
@@ -19,24 +19,29 @@ class CrawlerLoader(BaseRemote):
         if not urlparse(url).scheme:
             url = "http://" + url
 
-        visited_urls = set()  # Keep track of URLs that have been visited
-        base_url = urlparse(url).scheme + "://" + urlparse(url).hostname  # Extract the base URL
-        urls_to_visit = [url]  # List of URLs to be visited, starting with the initial URL
-        loaded_content = []  # Store the loaded content from each URL
+        visited_urls = set()
+        base_url = urlparse(url).scheme + "://" + urlparse(url).hostname
+        urls_to_visit = [url]
+        loaded_content = []
 
-        # Continue crawling until there are no more URLs to visit
         while urls_to_visit:
-            current_url = urls_to_visit.pop(0)  # Get the next URL to visit
-            visited_urls.add(current_url)  # Mark the URL as visited
+            current_url = urls_to_visit.pop(0)
+            visited_urls.add(current_url)
 
-            # Try to load and process the content from the current URL
             try:
-                response = requests.get(current_url)  # Fetch the content of the current URL
-                response.raise_for_status()  # Raise an exception for HTTP errors
-                loader = self.loader([current_url])  # Initialize the document loader for the current URL
-                loaded_content.extend(loader.load())  # Load the content and add it to the loaded_content list
+                response = requests.get(current_url)
+                response.raise_for_status()
+                loader = self.loader([current_url])
+                docs = loader.load()
+
+                # Convert the loaded documents to your Document schema
+                for doc in docs:
+                    loaded_content.append(
+                        Document(
+                            doc.page_content,
+                            extra_info=doc.metadata
+                        )
+                    )
             except Exception as e:
-                # Print an error message if loading or processing fails and continue with the next URL
                 print(f"Error processing URL {current_url}: {e}")
                 continue
@@ -45,15 +50,15 @@ class CrawlerLoader(BaseRemote):
             all_links = [
                 urljoin(current_url, a['href'])
                 for a in soup.find_all('a', href=True)
-                if base_url in urljoin(current_url, a['href'])  # Ensure links are from the same domain
+                if base_url in urljoin(current_url, a['href'])
             ]
 
             # Add new links to the list of URLs to visit if they haven't been visited yet
             urls_to_visit.extend([link for link in all_links if link not in visited_urls])
-            urls_to_visit = list(set(urls_to_visit))  # Remove duplicate URLs
+            urls_to_visit = list(set(urls_to_visit))
 
             # Stop crawling if the limit of pages to scrape is reached
             if self.limit is not None and len(visited_urls) >= self.limit:
                 break
 
-        return loaded_content  # Return the loaded content from all visited URLs
+        return loaded_content
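
For context, a minimal usage sketch of the updated loader. The import path and the Document attribute names (text, extra_info) are assumptions based on the calls shown in the diff, not confirmed by it:

from application.parser.remote.crawler_loader import CrawlerLoader  # assumed module path

loader = CrawlerLoader(limit=5)                      # crawl at most 5 pages
docs = loader.load_data("https://docs.example.com")  # hypothetical starting URL

for doc in docs:
    # Each entry is now the project's own Document schema rather than a raw
    # LangChain document; WebBaseLoader's metadata lands in extra_info.
    print(doc.extra_info.get("source"), len(doc.text))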

View File

@@ -0,0 +1,139 @@
import requests
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
from application.parser.remote.base import BaseRemote
import re
from markdownify import markdownify
from application.parser.schema.base import Document
import tldextract

class CrawlerLoader(BaseRemote):
    def __init__(self, limit=10, allow_subdomains=False):
        """
        Given a URL, crawl web pages up to `self.limit`, convert the HTML content
        to Markdown, and return a list of Document objects.

        :param limit: The maximum number of pages to crawl.
        :param allow_subdomains: If True, also crawl pages on subdomains of the base domain.
        """
        self.limit = limit
        self.allow_subdomains = allow_subdomains
        self.session = requests.Session()

    def load_data(self, inputs):
        url = inputs
        if isinstance(url, list) and url:
            url = url[0]

        # Ensure the URL has a scheme (if not, default to http)
        if not urlparse(url).scheme:
            url = "http://" + url

        # Keep track of visited URLs to avoid revisiting the same page
        visited_urls = set()

        # Determine the base domain for link filtering using tldextract
        base_domain = self._get_base_domain(url)

        urls_to_visit = {url}
        documents = []

        while urls_to_visit:
            current_url = urls_to_visit.pop()

            # Skip if already visited
            if current_url in visited_urls:
                continue
            visited_urls.add(current_url)

            # Fetch the page content
            html_content = self._fetch_page(current_url)
            if html_content is None:
                continue

            # Convert the HTML to Markdown for cleaner text formatting
            title, language, processed_markdown = self._process_html_to_markdown(html_content, current_url)
            if processed_markdown:
                # Create a Document for each visited page
                documents.append(
                    Document(
                        processed_markdown,  # content
                        None,  # doc_id
                        None,  # embedding
                        {"source": current_url, "title": title, "language": language}  # extra_info
                    )
                )

            # Extract links and filter them according to domain rules
            new_links = self._extract_links(html_content, current_url)
            filtered_links = self._filter_links(new_links, base_domain)

            # Add any new, not-yet-visited links to the queue
            urls_to_visit.update(link for link in filtered_links if link not in visited_urls)

            # If we've reached the limit, stop crawling
            if self.limit is not None and len(visited_urls) >= self.limit:
                break

        return documents

    def _fetch_page(self, url):
        try:
            response = self.session.get(url, timeout=10)
            response.raise_for_status()
            return response.text
        except requests.exceptions.RequestException as e:
            print(f"Error fetching URL {url}: {e}")
            return None

    def _process_html_to_markdown(self, html_content, current_url):
        soup = BeautifulSoup(html_content, 'html.parser')
        title_tag = soup.find('title')
        title = title_tag.text.strip() if title_tag else "No Title"

        # Extract language
        language_tag = soup.find('html')
        language = language_tag.get('lang', 'en') if language_tag else "en"

        markdownified = markdownify(html_content, heading_style="ATX", newline_style="BACKSLASH")
        # Reduce sequences of more than two newlines to exactly three
        markdownified = re.sub(r'\n{3,}', '\n\n\n', markdownified)
        return title, language, markdownified

    def _extract_links(self, html_content, current_url):
        soup = BeautifulSoup(html_content, 'html.parser')
        links = []
        for a in soup.find_all('a', href=True):
            full_url = urljoin(current_url, a['href'])
            links.append((full_url, a.text.strip()))
        return links

    def _get_base_domain(self, url):
        extracted = tldextract.extract(url)
        # Reconstruct the domain as domain.suffix
        base_domain = f"{extracted.domain}.{extracted.suffix}"
        return base_domain

    def _filter_links(self, links, base_domain):
        """
        Filter the extracted links to only include those that match the crawling criteria:
        - If allow_subdomains is True, allow any link whose domain ends with the base_domain.
        - If allow_subdomains is False, only allow exact matches of the base_domain.
        """
        filtered = []
        for link, _ in links:
            parsed_link = urlparse(link)
            if not parsed_link.netloc:
                continue

            extracted = tldextract.extract(parsed_link.netloc)
            link_base = f"{extracted.domain}.{extracted.suffix}"

            if self.allow_subdomains:
                # For subdomains: sub.example.com ends with example.com
                if link_base == base_domain or link_base.endswith("." + base_domain):
                    filtered.append(link)
            else:
                # Exact domain match
                if link_base == base_domain:
                    filtered.append(link)

        return filtered
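
A minimal usage sketch of the new Markdown-based crawler. The module path and the extra_info attribute are assumptions; the metadata keys come straight from the constructor call above:

from application.parser.remote.crawler_markdown import CrawlerLoader  # assumed module path

crawler = CrawlerLoader(limit=3, allow_subdomains=True)
documents = crawler.load_data(["example.com"])  # a missing scheme defaults to http://

for doc in documents:
    meta = doc.extra_info  # {"source": ..., "title": ..., "language": ...}
    print(meta["source"], meta["title"], meta["language"])

Per its docstring, allow_subdomains=True is meant to extend the crawl to pages on subdomains of the starting domain, with filtering done on the registered domain that tldextract derives from each link.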

View File

@@ -86,4 +86,6 @@ urllib3==2.3.0
 vine==5.1.0
 wcwidth==0.2.13
 werkzeug==3.1.3
 yarl==1.18.3
+markdownify==0.14.1
+tldextract==5.1.3

View File

@@ -203,53 +203,61 @@ def remote_worker(
     sync_frequency="never",
     operation_mode="upload",
     doc_id=None,
 ):
     full_path = os.path.join(directory, user, name_job)
     if not os.path.exists(full_path):
         os.makedirs(full_path)
     self.update_state(state="PROGRESS", meta={"current": 1})
 
-    logging.info(
-        f"Remote job: {full_path}",
-        extra={"user": user, "job": name_job, "source_data": source_data},
-    )
-    remote_loader = RemoteCreator.create_loader(loader)
-    raw_docs = remote_loader.load_data(source_data)
-    chunker = Chunker(
-        chunking_strategy="classic_chunk",
-        max_tokens=MAX_TOKENS,
-        min_tokens=MIN_TOKENS,
-        duplicate_headers=False
-    )
-    docs = chunker.chunk(documents=raw_docs)
-    tokens = count_tokens_docs(docs)
-    if operation_mode == "upload":
-        id = ObjectId()
-        embed_and_store_documents(docs, full_path, id, self)
-    elif operation_mode == "sync":
-        if not doc_id or not ObjectId.is_valid(doc_id):
-            raise ValueError("doc_id must be provided for sync operation.")
-        id = ObjectId(doc_id)
-        embed_and_store_documents(docs, full_path, id, self)
-    self.update_state(state="PROGRESS", meta={"current": 100})
+    try:
+        logging.info("Initializing remote loader with type: %s", loader)
+        remote_loader = RemoteCreator.create_loader(loader)
+        raw_docs = remote_loader.load_data(source_data)
+        chunker = Chunker(
+            chunking_strategy="classic_chunk",
+            max_tokens=MAX_TOKENS,
+            min_tokens=MIN_TOKENS,
+            duplicate_headers=False
+        )
+        docs = chunker.chunk(documents=raw_docs)
+        docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs]
+        tokens = count_tokens_docs(docs)
+        logging.info("Total tokens calculated: %d", tokens)
+        if operation_mode == "upload":
+            id = ObjectId()
+            embed_and_store_documents(docs, full_path, id, self)
+        elif operation_mode == "sync":
+            if not doc_id or not ObjectId.is_valid(doc_id):
+                logging.error("Invalid doc_id provided for sync operation: %s", doc_id)
+                raise ValueError("doc_id must be provided for sync operation.")
+            id = ObjectId(doc_id)
+            embed_and_store_documents(docs, full_path, id, self)
+        self.update_state(state="PROGRESS", meta={"current": 100})
-    file_data = {
-        "name": name_job,
-        "user": user,
-        "tokens": tokens,
-        "retriever": retriever,
-        "id": str(id),
-        "type": loader,
-        "remote_data": source_data,
-        "sync_frequency": sync_frequency,
-    }
-    upload_index(full_path, file_data)
-    shutil.rmtree(full_path)
+        file_data = {
+            "name": name_job,
+            "user": user,
+            "tokens": tokens,
+            "retriever": retriever,
+            "id": str(id),
+            "type": loader,
+            "remote_data": source_data,
+            "sync_frequency": sync_frequency,
+        }
+        upload_index(full_path, file_data)
+    except Exception as e:
+        logging.error("Error in remote_worker task: %s", str(e), exc_info=True)
+        raise
+    finally:
+        if os.path.exists(full_path):
+            shutil.rmtree(full_path)
+    logging.info("remote_worker task completed successfully")
 
     return {"urls": source_data, "name_job": name_job, "user": user, "limited": False}
 
 
 def sync(