From 8b3e960be0105e1ff02297ecfd1a81164143f86c Mon Sep 17 00:00:00 2001 From: ManishMadan2882 Date: Wed, 11 Jun 2025 16:00:09 +0530 Subject: [PATCH 1/5] (feat:ingestion) store filepath from now --- application/api/internal/routes.py | 4 ++++ application/worker.py | 1 + 2 files changed, 5 insertions(+) diff --git a/application/api/internal/routes.py b/application/api/internal/routes.py index 80759593..70c62877 100755 --- a/application/api/internal/routes.py +++ b/application/api/internal/routes.py @@ -47,6 +47,8 @@ def upload_index_files(): type = secure_filename(request.form["type"]) remote_data = request.form["remote_data"] if "remote_data" in request.form else None sync_frequency = secure_filename(request.form["sync_frequency"]) if "sync_frequency" in request.form else None + + original_file_path = request.form.get("original_file_path") storage = StorageCreator.get_storage() index_base_path = f"indexes/{id}" @@ -85,6 +87,7 @@ def upload_index_files(): "retriever": retriever, "remote_data": remote_data, "sync_frequency": sync_frequency, + "file_path": original_file_path, } }, ) @@ -102,6 +105,7 @@ def upload_index_files(): "retriever": retriever, "remote_data": remote_data, "sync_frequency": sync_frequency, + "file_path": original_file_path, ## relative path to the original file, for the storage referrence } ) return {"status": "ok"} diff --git a/application/worker.py b/application/worker.py index 285b13af..a73bd8f3 100755 --- a/application/worker.py +++ b/application/worker.py @@ -290,6 +290,7 @@ def ingest_worker( "retriever": retriever, "id": str(id), "type": "local", + "original_file_path": source_file_path, # Pass the original file path } upload_index(vector_store_path, file_data) From 142477ab9b2905669a752591e44eee25e2e97233 Mon Sep 17 00:00:00 2001 From: ManishMadan2882 Date: Wed, 11 Jun 2025 21:03:38 +0530 Subject: [PATCH 2/5] (feat:safe_filename) handles case of non-ascii char --- application/utils.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/application/utils.py b/application/utils.py index 6d47d31a..548c8828 100644 --- a/application/utils.py +++ b/application/utils.py @@ -1,8 +1,11 @@ import hashlib +import os import re +import uuid import tiktoken from flask import jsonify, make_response +from werkzeug.utils import secure_filename _encoding = None @@ -15,6 +18,31 @@ def get_encoding(): return _encoding +def safe_filename(filename): + """ + Creates a safe filename that preserves the original extension. + Uses secure_filename, but ensures a proper filename is returned even with non-Latin characters. + + Args: + filename (str): The original filename + + Returns: + str: A safe filename that can be used for storage + """ + if not filename: + return str(uuid.uuid4()) + + _, extension = os.path.splitext(filename) + + safe_name = secure_filename(filename) + + # If secure_filename returns just the extension or an empty string + if not safe_name or safe_name == extension.lstrip('.'): + return f"{str(uuid.uuid4())}{extension}" + + return safe_name + + def num_tokens_from_string(string: str) -> int: encoding = get_encoding() if isinstance(string, str): From 78d5ed2ed2c69f2e07377fbafc8c74aaadcfba4b Mon Sep 17 00:00:00 2001 From: ManishMadan2882 Date: Wed, 11 Jun 2025 21:04:50 +0530 Subject: [PATCH 3/5] (fix:ingestion) uuid for non-ascii filename --- application/api/internal/routes.py | 17 +++++++++-------- application/api/user/routes.py | 16 +++++++--------- 2 files changed, 16 insertions(+), 17 deletions(-) diff --git a/application/api/internal/routes.py b/application/api/internal/routes.py index 70c62877..2f60d8ea 100755 --- a/application/api/internal/routes.py +++ b/application/api/internal/routes.py @@ -7,6 +7,7 @@ import logging from application.core.mongo_db import MongoDB from application.core.settings import settings from application.storage.storage_creator import StorageCreator +from application.utils import safe_filename logger = logging.getLogger(__name__) @@ -37,18 +38,18 @@ def upload_index_files(): """Upload two files(index.faiss, index.pkl) to the user's folder.""" if "user" not in request.form: return {"status": "no user"} - user = secure_filename(request.form["user"]) + user = safe_filename(request.form["user"]) if "name" not in request.form: return {"status": "no name"} - job_name = secure_filename(request.form["name"]) - tokens = secure_filename(request.form["tokens"]) - retriever = secure_filename(request.form["retriever"]) - id = secure_filename(request.form["id"]) - type = secure_filename(request.form["type"]) + job_name = safe_filename(request.form["name"]) + tokens = request.form["tokens"] + retriever = request.form["retriever"] + id = request.form["id"] + type = request.form["type"] remote_data = request.form["remote_data"] if "remote_data" in request.form else None - sync_frequency = secure_filename(request.form["sync_frequency"]) if "sync_frequency" in request.form else None + sync_frequency = request.form["sync_frequency"] if "sync_frequency" in request.form else None - original_file_path = request.form.get("original_file_path") + original_file_path = request.form.get("original_file_path") # Already sanitized path storage = StorageCreator.get_storage() index_base_path = f"indexes/{id}" diff --git a/application/api/user/routes.py b/application/api/user/routes.py index d98e4092..dbcc751e 100644 --- a/application/api/user/routes.py +++ b/application/api/user/routes.py @@ -28,7 +28,7 @@ from application.core.settings import settings from application.extensions import api from application.storage.storage_creator import StorageCreator from application.tts.google_tts import GoogleTTS -from application.utils import check_required_fields, validate_function_name +from application.utils import check_required_fields, safe_filename, validate_function_name from application.vectorstore.vector_creator import VectorCreator storage = StorageCreator.get_storage() @@ -497,20 +497,17 @@ class UploadFile(Resource): ), 400, ) - user = secure_filename(decoded_token.get("sub")) - job_name = secure_filename(request.form["name"]) + user = safe_filename(decoded_token.get("sub")) + job_name = safe_filename(request.form["name"]) try: - from application.storage.storage_creator import StorageCreator - storage = StorageCreator.get_storage() - base_path = f"{settings.UPLOAD_FOLDER}/{user}/{job_name}" if len(files) > 1: temp_files = [] for file in files: - filename = secure_filename(file.filename) + filename = safe_filename(file.filename) temp_path = f"{base_path}/temp/{filename}" storage.save_file(file, temp_path) temp_files.append(temp_path) @@ -604,7 +601,7 @@ class UploadFile(Resource): # For single file file = files[0] - filename = secure_filename(file.filename) + filename = safe_filename(file.filename) file_path = f"{base_path}/{filename}" storage.save_file(file, file_path) @@ -3457,7 +3454,8 @@ class StoreAttachment(Resource): jsonify({"status": "error", "message": "Missing file"}), 400, ) - user = secure_filename(decoded_token.get("sub")) + # Apply safe_filename to user ID + user = safe_filename(decoded_token.get("sub")) try: attachment_id = ObjectId() From 44b6ec25a2fc5e6f67790aa9e1b335d7444b6a8e Mon Sep 17 00:00:00 2001 From: ManishMadan2882 Date: Wed, 11 Jun 2025 21:18:37 +0530 Subject: [PATCH 4/5] clean --- application/api/internal/routes.py | 4 ++-- application/api/user/routes.py | 1 - application/worker.py | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/application/api/internal/routes.py b/application/api/internal/routes.py index 2f60d8ea..435b9cf4 100755 --- a/application/api/internal/routes.py +++ b/application/api/internal/routes.py @@ -49,7 +49,7 @@ def upload_index_files(): remote_data = request.form["remote_data"] if "remote_data" in request.form else None sync_frequency = request.form["sync_frequency"] if "sync_frequency" in request.form else None - original_file_path = request.form.get("original_file_path") # Already sanitized path + original_file_path = request.form.get("original_file_path") storage = StorageCreator.get_storage() index_base_path = f"indexes/{id}" @@ -106,7 +106,7 @@ def upload_index_files(): "retriever": retriever, "remote_data": remote_data, "sync_frequency": sync_frequency, - "file_path": original_file_path, ## relative path to the original file, for the storage referrence + "file_path": original_file_path, } ) return {"status": "ok"} diff --git a/application/api/user/routes.py b/application/api/user/routes.py index dbcc751e..e65b7891 100644 --- a/application/api/user/routes.py +++ b/application/api/user/routes.py @@ -3454,7 +3454,6 @@ class StoreAttachment(Resource): jsonify({"status": "error", "message": "Missing file"}), 400, ) - # Apply safe_filename to user ID user = safe_filename(decoded_token.get("sub")) try: diff --git a/application/worker.py b/application/worker.py index a73bd8f3..85265308 100755 --- a/application/worker.py +++ b/application/worker.py @@ -290,7 +290,7 @@ def ingest_worker( "retriever": retriever, "id": str(id), "type": "local", - "original_file_path": source_file_path, # Pass the original file path + "original_file_path": source_file_path, } upload_index(vector_store_path, file_data) From b8a10e0962f817dafd3835b91e089086f38e3b66 Mon Sep 17 00:00:00 2001 From: ManishMadan2882 Date: Thu, 12 Jun 2025 00:57:46 +0530 Subject: [PATCH 5/5] (fix:ingestion) display names are separate --- application/api/internal/routes.py | 4 ++-- application/api/user/routes.py | 20 ++++++++++++++------ application/api/user/tasks.py | 4 ++-- application/worker.py | 22 ++++++++++++---------- 4 files changed, 30 insertions(+), 20 deletions(-) diff --git a/application/api/internal/routes.py b/application/api/internal/routes.py index 435b9cf4..21d97261 100755 --- a/application/api/internal/routes.py +++ b/application/api/internal/routes.py @@ -38,10 +38,10 @@ def upload_index_files(): """Upload two files(index.faiss, index.pkl) to the user's folder.""" if "user" not in request.form: return {"status": "no user"} - user = safe_filename(request.form["user"]) + user = request.form["user"] if "name" not in request.form: return {"status": "no name"} - job_name = safe_filename(request.form["name"]) + job_name = request.form["name"] tokens = request.form["tokens"] retriever = request.form["retriever"] id = request.form["id"] diff --git a/application/api/user/routes.py b/application/api/user/routes.py index e65b7891..19716488 100644 --- a/application/api/user/routes.py +++ b/application/api/user/routes.py @@ -497,12 +497,16 @@ class UploadFile(Resource): ), 400, ) - user = safe_filename(decoded_token.get("sub")) - job_name = safe_filename(request.form["name"]) + user = decoded_token.get("sub") + job_name = request.form["name"] + + # Create safe versions for filesystem operations + safe_user = safe_filename(user) + dir_name = safe_filename(job_name) try: storage = StorageCreator.get_storage() - base_path = f"{settings.UPLOAD_FOLDER}/{user}/{job_name}" + base_path = f"{settings.UPLOAD_FOLDER}/{safe_user}/{dir_name}" if len(files) > 1: temp_files = [] @@ -512,11 +516,11 @@ class UploadFile(Resource): storage.save_file(file, temp_path) temp_files.append(temp_path) print(f"Saved file: {filename}") - zip_filename = f"{job_name}.zip" + zip_filename = f"{dir_name}.zip" zip_path = f"{base_path}/{zip_filename}" zip_temp_path = None - def create_zip_archive(temp_paths, job_name, storage): + def create_zip_archive(temp_paths, dir_name, storage): import tempfile with tempfile.NamedTemporaryFile( @@ -556,7 +560,7 @@ class UploadFile(Resource): return zip_output_path try: - zip_temp_path = create_zip_archive(temp_files, job_name, storage) + zip_temp_path = create_zip_archive(temp_files, dir_name, storage) with open(zip_temp_path, "rb") as zip_file: storage.save_file(zip_file, zip_path) task = ingest.delay( @@ -581,6 +585,8 @@ class UploadFile(Resource): job_name, zip_filename, user, + dir_name, + safe_user, ) finally: # Clean up temporary files @@ -628,6 +634,8 @@ class UploadFile(Resource): job_name, filename, # Corrected variable for single-file case user, + dir_name, + safe_user, ) except Exception as err: current_app.logger.error(f"Error uploading file: {err}", exc_info=True) diff --git a/application/api/user/tasks.py b/application/api/user/tasks.py index fffa9ba9..c7003ef3 100644 --- a/application/api/user/tasks.py +++ b/application/api/user/tasks.py @@ -11,8 +11,8 @@ from application.worker import ( @celery.task(bind=True) -def ingest(self, directory, formats, name_job, filename, user): - resp = ingest_worker(self, directory, formats, name_job, filename, user) +def ingest(self, directory, formats, job_name, filename, user, dir_name, user_dir): + resp = ingest_worker(self, directory, formats, job_name, filename, user, dir_name, user_dir) return resp diff --git a/application/worker.py b/application/worker.py index 85265308..235c969e 100755 --- a/application/worker.py +++ b/application/worker.py @@ -194,7 +194,7 @@ def run_agent_logic(agent_config, input_data): # Define the main function for ingesting and processing documents. def ingest_worker( - self, directory, formats, name_job, filename, user, retriever="classic" + self, directory, formats, job_name, filename, user, dir_name=None, user_dir=None, retriever="classic" ): """ Ingest and process documents. @@ -203,9 +203,11 @@ def ingest_worker( self: Reference to the instance of the task. directory (str): Specifies the directory for ingesting ('inputs' or 'temp'). formats (list of str): List of file extensions to consider for ingestion (e.g., [".rst", ".md"]). - name_job (str): Name of the job for this ingestion task. + job_name (str): Name of the job for this ingestion task (original, unsanitized). filename (str): Name of the file to be ingested. - user (str): Identifier for the user initiating the ingestion. + user (str): Identifier for the user initiating the ingestion (original, unsanitized). + dir_name (str, optional): Sanitized directory name for filesystem operations. + user_dir (str, optional): Sanitized user ID for filesystem operations. retriever (str): Type of retriever to use for processing the documents. Returns: @@ -216,13 +218,13 @@ def ingest_worker( limit = None exclude = True sample = False - + storage = StorageCreator.get_storage() - full_path = os.path.join(directory, user, name_job) + full_path = os.path.join(directory, user_dir, dir_name) source_file_path = os.path.join(full_path, filename) - logging.info(f"Ingest file: {full_path}", extra={"user": user, "job": name_job}) + logging.info(f"Ingest file: {full_path}", extra={"user": user, "job": job_name}) # Create temporary working directory with tempfile.TemporaryDirectory() as temp_dir: @@ -283,9 +285,9 @@ def ingest_worker( for i in range(min(5, len(raw_docs))): logging.info(f"Sample document {i}: {raw_docs[i]}") file_data = { - "name": name_job, + "name": job_name, # Use original job_name "file": filename, - "user": user, + "user": user, # Use original user "tokens": tokens, "retriever": retriever, "id": str(id), @@ -302,9 +304,9 @@ def ingest_worker( return { "directory": directory, "formats": formats, - "name_job": name_job, + "name_job": job_name, # Use original job_name "filename": filename, - "user": user, + "user": user, # Use original user "limited": False, }