diff --git a/application/parser/remote/crawler_loader.py b/application/parser/remote/crawler_loader.py
index ee037e59..2a63f284 100644
--- a/application/parser/remote/crawler_loader.py
+++ b/application/parser/remote/crawler_loader.py
@@ -10,7 +10,7 @@ class CrawlerLoader(BaseRemote):
         self.limit = limit  # Set the limit for the number of pages to scrape
 
     def load_data(self, inputs):
-        url = inputs['data']
+        url = inputs
         # Check if the input is a list and if it is, use the first element
         if isinstance(url, list) and url:
             url = url[0]
diff --git a/application/parser/remote/sitemap_loader.py b/application/parser/remote/sitemap_loader.py
index 0748f104..6e9182c4 100644
--- a/application/parser/remote/sitemap_loader.py
+++ b/application/parser/remote/sitemap_loader.py
@@ -10,7 +10,7 @@ class SitemapLoader(BaseRemote):
         self.limit = limit  # Adding limit to control the number of URLs to process
 
     def load_data(self, inputs):
-        sitemap_url= inputs['data']
+        sitemap_url= inputs
         # Check if the input is a list and if it is, use the first element
         if isinstance(sitemap_url, list) and sitemap_url:
             url = sitemap_url[0]
diff --git a/application/parser/remote/web_loader.py b/application/parser/remote/web_loader.py
index e5cd2e2f..9fc50c1c 100644
--- a/application/parser/remote/web_loader.py
+++ b/application/parser/remote/web_loader.py
@@ -6,7 +6,7 @@ class WebLoader(BaseRemote):
         self.loader = WebBaseLoader
 
     def load_data(self, inputs):
-        urls = inputs['data']
+        urls = inputs
         if isinstance(urls, str):
             urls = [urls]  # Convert string to list if a single URL is passed
 
diff --git a/application/worker.py b/application/worker.py
index 50344a26..875611bf 100644
--- a/application/worker.py
+++ b/application/worker.py
@@ -138,19 +138,17 @@ def remote_worker(self, source_data, name_job, user, directory = 'temp', loader
     # source_data {"data": [url]} for url type task just urls
 
     # Use RemoteCreator to load data from URL
-    remote_loader = RemoteCreator.create_loader(loader, source_data)
-    raw_docs = remote_loader.load_data()
+    remote_loader = RemoteCreator.create_loader(loader)
+    raw_docs = remote_loader.load_data(source_data)
 
-    raw_docs = group_split(documents=raw_docs, min_tokens=min_tokens, max_tokens=max_tokens, token_check=token_check)
+    docs = group_split(documents=raw_docs, min_tokens=min_tokens, max_tokens=max_tokens, token_check=token_check)
 
-    docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs]
+    #docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs]
 
     call_openai_api(docs, full_path, self)
     self.update_state(state='PROGRESS', meta={'current': 100})
+
 
-    if sample:
-        for i in range(min(5, len(raw_docs))):
-            print(raw_docs[i].text)
 
     # Proceed with uploading and cleaning as in the original function
     file_data = {'name': name_job, 'user': user}
@@ -165,7 +163,7 @@ def remote_worker(self, source_data, name_job, user, directory = 'temp', loader
         shutil.rmtree(full_path)
 
     return {
-        'urls': source_data['data'],
+        'urls': source_data,
         'name_job': name_job,
         'user': user,
         'limited': False
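
For context on the interface change above: loaders are now built from the loader type alone, and load_data receives the raw URL input (a plain string or a list of URLs) that remote_worker passes as source_data, instead of a {"data": [...]} dict supplied at construction time. The sketch below is illustrative only; DummyRemoteLoader and create_loader are hypothetical stand-ins, not the repository's real RemoteCreator or loader classes.

# Illustrative sketch only -- DummyRemoteLoader and create_loader are
# hypothetical stand-ins for the repo's RemoteCreator and loader classes.
from typing import List, Union


class DummyRemoteLoader:
    """Mimics the new loader contract: load_data takes the raw input
    (a URL string or a list of URLs), not a {"data": [...]} dict."""

    def load_data(self, inputs: Union[str, List[str]]) -> List[str]:
        urls = inputs
        if isinstance(urls, str):
            urls = [urls]  # a single URL arrives as a plain string
        # a non-empty list of URLs is used as-is
        return [f"document fetched from {url}" for url in urls]


def create_loader(loader_type: str) -> DummyRemoteLoader:
    # Stands in for RemoteCreator.create_loader(loader): the loader type
    # alone is enough to build the instance; no source data needed here.
    return DummyRemoteLoader()


if __name__ == "__main__":
    source_data = ["https://example.com"]  # what remote_worker now forwards
    remote_loader = create_loader("url")
    raw_docs = remote_loader.load_data(source_data)
    print(raw_docs)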