diff --git a/application/api/user/routes.py b/application/api/user/routes.py
index 86742572..592a82cd 100644
--- a/application/api/user/routes.py
+++ b/application/api/user/routes.py
@@ -5,7 +5,7 @@ from pymongo import MongoClient
 from bson.objectid import ObjectId
 from werkzeug.utils import secure_filename
 
-from application.api.user.tasks import ingest
+from application.api.user.tasks import ingest, ingest_remote
 
 from application.core.settings import settings
 from application.vectorstore.vector_creator import VectorCreator
@@ -157,6 +157,33 @@ def upload_file():
         return {"status": "ok", "task_id": task_id}
     else:
         return {"status": "error"}
+
+@user.route("/api/remote", methods=["POST"])
+def upload_remote():
+    """Upload a remote source to get vectorized and indexed."""
+    if "user" not in request.form:
+        return {"status": "no user"}
+    user = secure_filename(request.form["user"])
+    if "source" not in request.form:
+        return {"status": "no source"}
+    source = secure_filename(request.form["source"])
+    if "name" not in request.form:
+        return {"status": "no name"}
+    job_name = secure_filename(request.form["name"])
+    # check that the post request carries the remote source payload
+    if "data" not in request.form:
+        print("No data")
+        return {"status": "no data"}
+    source_data = request.form["data"]
+    # NOTE(review): form values are plain strings, but remote_worker reads
+    # source_data['data'] — confirm the loader/worker json-decodes this first
+
+    if source_data:
+        task = ingest_remote.delay(source_data=source_data, job_name=job_name, user=user, loader=source)
+        # task id
+        task_id = task.id
+        return {"status": "ok", "task_id": task_id}
+    else:
+        return {"status": "error"}
 
 @user.route("/api/task_status", methods=["GET"])
 def task_status():
diff --git a/application/api/user/tasks.py b/application/api/user/tasks.py
index a3474939..4602bf85 100644
--- a/application/api/user/tasks.py
+++ b/application/api/user/tasks.py
@@ -1,7 +1,12 @@
-from application.worker import ingest_worker
+from application.worker import ingest_worker, remote_worker
 from application.celery import celery
 
 @celery.task(bind=True)
 def ingest(self, directory, formats, name_job, filename, user):
     resp = ingest_worker(self, directory, formats, name_job, filename, user)
     return resp
+
+@celery.task(bind=True)
+def ingest_remote(self, source_data, job_name, user, loader):
+    resp = remote_worker(self, source_data, job_name, user, loader)
+    return resp
diff --git a/application/worker.py b/application/worker.py
index 5fc28749..50344a26 100644
--- a/application/worker.py
+++ b/application/worker.py
@@ -123,7 +123,7 @@ def ingest_worker(self, directory, formats, name_job, filename, user):
         'limited': False
     }
 
-def remote_worker(self, inputs, name_job, user, directory = 'temp', loader = 'url'):
+def remote_worker(self, source_data, name_job, user, directory = 'temp', loader = 'url'):
     sample = False
     token_check = True
     min_tokens = 150
@@ -135,10 +135,10 @@ def remote_worker(self, inputs, name_job, user, directory = 'temp', loader = 'ur
 
     self.update_state(state='PROGRESS', meta={'current': 1})
 
-    # inputs {"data": [url]} for url type task just urls
+    # source_data {"data": [url]} for url type task just urls
     # Use RemoteCreator to load data from URL
-    remote_loader = RemoteCreator.create_loader(loader, inputs)
+    remote_loader = RemoteCreator.create_loader(loader, source_data)
     raw_docs = remote_loader.load_data()
 
     raw_docs = group_split(documents=raw_docs, min_tokens=min_tokens, max_tokens=max_tokens, token_check=token_check)
 
@@ -165,7 +165,7 @@ def remote_worker(self, inputs, name_job, user, directory = 'temp', loader = 'ur
     shutil.rmtree(full_path)
 
     return {
-        'urls': inputs['data'],
+        'urls': source_data['data'],
         'name_job': name_job,
         'user': user,
         'limited': False