Refactor remote loader classes to accept URL inputs directly instead of a {'data': ...} dict

2026-02-10 00:02:06 +00:00 · 2024-02-14 15:17:56 +00:00
parent 030c2a740f
commit 0cb3d12d94
4 changed files with 9 additions and 11 deletions
--- a/application/parser/remote/crawler_loader.py
+++ b/application/parser/remote/crawler_loader.py
@@ -10,7 +10,7 @@ class CrawlerLoader(BaseRemote):
        self.limit = limit  # Set the limit for the number of pages to scrape

    def load_data(self, inputs):
-        url = inputs['data']
+        url = inputs
        # Check if the input is a list and if it is, use the first element
        if isinstance(url, list) and url:
            url = url[0]
--- a/application/parser/remote/sitemap_loader.py
+++ b/application/parser/remote/sitemap_loader.py
@@ -10,7 +10,7 @@ class SitemapLoader(BaseRemote):
        self.limit = limit  # Adding limit to control the number of URLs to process

    def load_data(self, inputs):
-        sitemap_url= inputs['data']
+        sitemap_url= inputs
        # Check if the input is a list and if it is, use the first element
        if isinstance(sitemap_url, list) and sitemap_url:
            url = sitemap_url[0]
--- a/application/parser/remote/web_loader.py
+++ b/application/parser/remote/web_loader.py
@@ -6,7 +6,7 @@ class WebLoader(BaseRemote):
        self.loader = WebBaseLoader

    def load_data(self, inputs):
-        urls = inputs['data']
+        urls = inputs

        if isinstance(urls, str):
            urls = [urls] # Convert string to list if a single URL is passed
--- a/application/worker.py
+++ b/application/worker.py
@@ -138,19 +138,17 @@ def remote_worker(self, source_data, name_job, user, directory = 'temp', loader
    # source_data {"data": [url]} for url type task just urls
 
    # Use RemoteCreator to load data from URL
-    remote_loader = RemoteCreator.create_loader(loader, source_data)
-    raw_docs = remote_loader.load_data()
+    remote_loader = RemoteCreator.create_loader(loader)
+    raw_docs = remote_loader.load_data(source_data)

-    raw_docs = group_split(documents=raw_docs, min_tokens=min_tokens, max_tokens=max_tokens, token_check=token_check)
+    docs = group_split(documents=raw_docs, min_tokens=min_tokens, max_tokens=max_tokens, token_check=token_check)

-    docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs]
+    #docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs]

    call_openai_api(docs, full_path, self)
    self.update_state(state='PROGRESS', meta={'current': 100})
+    

-    if sample:
-        for i in range(min(5, len(raw_docs))):
-            print(raw_docs[i].text)

    # Proceed with uploading and cleaning as in the original function
    file_data = {'name': name_job, 'user': user}
@@ -165,7 +163,7 @@ def remote_worker(self, source_data, name_job, user, directory = 'temp', loader
    shutil.rmtree(full_path)

    return {
-        'urls': source_data['data'],
+        'urls': source_data,
        'name_job': name_job,
        'user': user,
        'limited': False