From 658867cb46e253b8ae8b128f81ea5e50d999d613 Mon Sep 17 00:00:00 2001 From: Pavel Date: Thu, 12 Oct 2023 01:03:40 +0400 Subject: [PATCH] No crawler, no sitemap --- application/parser/remote/base.py | 2 +- application/parser/remote/crawler_loader.py | 0 application/parser/remote/github_loader.py | 0 application/parser/remote/remote_creator.py | 18 ++++++++ application/parser/remote/sitemap_loader.py | 0 application/parser/remote/web_loader.py | 10 +++++ application/worker.py | 47 +++++++++++++++++++++ 7 files changed, 76 insertions(+), 1 deletion(-) create mode 100644 application/parser/remote/crawler_loader.py create mode 100644 application/parser/remote/github_loader.py create mode 100644 application/parser/remote/remote_creator.py create mode 100644 application/parser/remote/sitemap_loader.py create mode 100644 application/parser/remote/web_loader.py diff --git a/application/parser/remote/base.py b/application/parser/remote/base.py index 91313f22..75ae34d5 100644 --- a/application/parser/remote/base.py +++ b/application/parser/remote/base.py @@ -1,6 +1,6 @@ """Base reader class.""" from abc import abstractmethod -from typing import Any, List +from typing import Any, List, Iterator from langchain.docstore.document import Document as LCDocument from application.parser.schema.base import Document diff --git a/application/parser/remote/crawler_loader.py b/application/parser/remote/crawler_loader.py new file mode 100644 index 00000000..e69de29b diff --git a/application/parser/remote/github_loader.py b/application/parser/remote/github_loader.py new file mode 100644 index 00000000..e69de29b diff --git a/application/parser/remote/remote_creator.py b/application/parser/remote/remote_creator.py new file mode 100644 index 00000000..e12b7a02 --- /dev/null +++ b/application/parser/remote/remote_creator.py @@ -0,0 +1,18 @@ +# from sitemap_loader import SitemapLoader +# from crawler_loader import CrawlerLoader +from application.parser.remote.web_loader import WebLoader + + 
+class RemoteCreator:
+    loaders = {
+        'url': WebLoader,
+        # 'sitemap': SitemapLoader,
+        # 'crawler': CrawlerLoader
+    }
+
+    @classmethod
+    def create_loader(cls, type, *args, **kwargs):
+        loader_class = cls.loaders.get(type.lower())
+        if not loader_class:
+            raise ValueError(f"No remote loader class found for type {type}")
+        return loader_class(*args, **kwargs)
\ No newline at end of file
diff --git a/application/parser/remote/sitemap_loader.py b/application/parser/remote/sitemap_loader.py
new file mode 100644
index 00000000..e69de29b
diff --git a/application/parser/remote/web_loader.py b/application/parser/remote/web_loader.py
new file mode 100644
index 00000000..ad2847e2
--- /dev/null
+++ b/application/parser/remote/web_loader.py
@@ -0,0 +1,10 @@
+from application.parser.remote.base import BaseRemote
+
+class WebLoader(BaseRemote):
+    def __init__(self):
+        from langchain.document_loaders import WebBaseLoader
+        self.loader = WebBaseLoader
+
+    def load_data(self, urls):
+        loader = self.loader(urls)
+        return loader.load()
\ No newline at end of file
diff --git a/application/worker.py b/application/worker.py
index 71fcd615..fe4e2615 100644
--- a/application/worker.py
+++ b/application/worker.py
@@ -9,6 +9,7 @@ import requests
 from application.core.settings import settings
 from application.parser.file.bulk import SimpleDirectoryReader
+from application.parser.remote.remote_creator import RemoteCreator
 from application.parser.open_ai_func import call_openai_api
 from application.parser.schema.base import Document
 from application.parser.token_func import group_split
@@ -104,3 +105,49 @@ def ingest_worker(self, directory, formats, name_job, filename, user):
         'user': user,
         'limited': False
     }
+
+def remote_worker(self, urls, name_job, user, directory = 'temp', loader = 'url'):
+    sample = False
+    token_check = True
+    min_tokens = 150
+    max_tokens = 1250
+    full_path = directory + '/' + user + '/' + name_job
+
+    if not os.path.exists(full_path):
+        os.makedirs(full_path)
+
+    self.update_state(state='PROGRESS', meta={'current': 1})
+
+    # WebLoader.__init__ takes no extra args; urls go to load_data instead
+    remote_loader = RemoteCreator.create_loader(loader)
+    raw_docs = remote_loader.load_data(urls)
+
+    raw_docs = group_split(documents=raw_docs, min_tokens=min_tokens, max_tokens=max_tokens, token_check=token_check)
+
+    docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs]
+
+    call_openai_api(docs, full_path, self)
+    self.update_state(state='PROGRESS', meta={'current': 100})
+
+    if sample:
+        for i in range(min(5, len(raw_docs))):
+            print(raw_docs[i].text)
+
+    # Proceed with uploading and cleaning as in the original function
+    file_data = {'name': name_job, 'user': user}
+    if settings.VECTOR_STORE == "faiss":
+        files = {'file_faiss': open(full_path + '/index.faiss', 'rb'),
+                 'file_pkl': open(full_path + '/index.pkl', 'rb')}
+        response = requests.post(urljoin(settings.API_URL, "/api/upload_index"), files=files, data=file_data)
+        response = requests.get(urljoin(settings.API_URL, "/api/delete_old?path=" + full_path))
+    else:
+        response = requests.post(urljoin(settings.API_URL, "/api/upload_index"), data=file_data)
+
+    shutil.rmtree(full_path)
+
+    return {
+        'urls': urls,
+        'name_job': name_job,
+        'user': user,
+        'limited': False
+    }
\ No newline at end of file