Change loader input: load_data now takes the inputs dict and reads inputs['data']

This commit is contained in:
Pavel
2023-10-13 21:52:56 +04:00
parent 8b3b16bce4
commit 381a2740ee
4 changed files with 9 additions and 5 deletions

View File

@@ -9,7 +9,8 @@ class CrawlerLoader(BaseRemote):
self.loader = WebBaseLoader # Initialize the document loader
self.limit = limit # Set the limit for the number of pages to scrape
def load_data(self, url):
def load_data(self, inputs):
url = inputs['data']
# Check if the input is a list and if it is, use the first element
if isinstance(url, list) and url:
url = url[0]

View File

@@ -9,11 +9,12 @@ class SitemapLoader(BaseRemote):
self.loader = WebBaseLoader
self.limit = limit # Adding limit to control the number of URLs to process
def load_data(self, sitemap_url):
def load_data(self, inputs):
sitemap_url= inputs['data']
# Check if the input is a list and if it is, use the first element
if isinstance(sitemap_url, list) and sitemap_url:
url = sitemap_url[0]
urls = self._extract_urls(sitemap_url)
if not urls:
print(f"No URLs found in the sitemap: {sitemap_url}")

View File

@@ -5,7 +5,9 @@ class WebLoader(BaseRemote):
from langchain.document_loaders import WebBaseLoader
self.loader = WebBaseLoader
def load_data(self, urls):
def load_data(self, inputs):
urls = inputs['data']
if isinstance(urls, str):
urls = [urls] # Convert string to list if a single URL is passed

View File

@@ -121,7 +121,7 @@ def remote_worker(self, inputs, name_job, user, directory = 'temp', loader = 'ur
# inputs is a dict like {"data": [url]}; for URL-type tasks "data" holds only the URLs
# Use RemoteCreator to load data from URL
remote_loader = RemoteCreator.create_loader(loader, inputs['data'])
remote_loader = RemoteCreator.create_loader(loader, inputs)
raw_docs = remote_loader.load_data()
raw_docs = group_split(documents=raw_docs, min_tokens=min_tokens, max_tokens=max_tokens, token_check=token_check)