diff --git a/application/api/internal/routes.py b/application/api/internal/routes.py index 80759593..21d97261 100755 --- a/application/api/internal/routes.py +++ b/application/api/internal/routes.py @@ -7,6 +7,7 @@ import logging from application.core.mongo_db import MongoDB from application.core.settings import settings from application.storage.storage_creator import StorageCreator +from application.utils import safe_filename logger = logging.getLogger(__name__) @@ -37,16 +38,18 @@ def upload_index_files(): """Upload two files(index.faiss, index.pkl) to the user's folder.""" if "user" not in request.form: return {"status": "no user"} - user = secure_filename(request.form["user"]) + user = request.form["user"] if "name" not in request.form: return {"status": "no name"} - job_name = secure_filename(request.form["name"]) - tokens = secure_filename(request.form["tokens"]) - retriever = secure_filename(request.form["retriever"]) - id = secure_filename(request.form["id"]) - type = secure_filename(request.form["type"]) + job_name = request.form["name"] + tokens = request.form["tokens"] + retriever = request.form["retriever"] + id = request.form["id"] + type = request.form["type"] remote_data = request.form["remote_data"] if "remote_data" in request.form else None - sync_frequency = secure_filename(request.form["sync_frequency"]) if "sync_frequency" in request.form else None + sync_frequency = request.form["sync_frequency"] if "sync_frequency" in request.form else None + + original_file_path = request.form.get("original_file_path") storage = StorageCreator.get_storage() index_base_path = f"indexes/{id}" @@ -85,6 +88,7 @@ def upload_index_files(): "retriever": retriever, "remote_data": remote_data, "sync_frequency": sync_frequency, + "file_path": original_file_path, } }, ) @@ -102,6 +106,7 @@ def upload_index_files(): "retriever": retriever, "remote_data": remote_data, "sync_frequency": sync_frequency, + "file_path": original_file_path, } ) return {"status": "ok"} diff --git a/application/api/user/routes.py b/application/api/user/routes.py index d98e4092..19716488 100644 --- a/application/api/user/routes.py +++ b/application/api/user/routes.py @@ -28,7 +28,7 @@ from application.core.settings import settings from application.extensions import api from application.storage.storage_creator import StorageCreator from application.tts.google_tts import GoogleTTS -from application.utils import check_required_fields, validate_function_name +from application.utils import check_required_fields, safe_filename, validate_function_name from application.vectorstore.vector_creator import VectorCreator storage = StorageCreator.get_storage() @@ -497,29 +497,30 @@ class UploadFile(Resource): ), 400, ) - user = secure_filename(decoded_token.get("sub")) - job_name = secure_filename(request.form["name"]) + user = decoded_token.get("sub") + job_name = request.form["name"] + + # Create safe versions for filesystem operations + safe_user = safe_filename(user) + dir_name = safe_filename(job_name) try: - from application.storage.storage_creator import StorageCreator - storage = StorageCreator.get_storage() - - base_path = f"{settings.UPLOAD_FOLDER}/{user}/{job_name}" + base_path = f"{settings.UPLOAD_FOLDER}/{safe_user}/{dir_name}" if len(files) > 1: temp_files = [] for file in files: - filename = secure_filename(file.filename) + filename = safe_filename(file.filename) temp_path = f"{base_path}/temp/{filename}" storage.save_file(file, temp_path) temp_files.append(temp_path) print(f"Saved file: {filename}") - zip_filename = f"{job_name}.zip" + zip_filename = f"{dir_name}.zip" zip_path = f"{base_path}/{zip_filename}" zip_temp_path = None - def create_zip_archive(temp_paths, job_name, storage): + def create_zip_archive(temp_paths, dir_name, storage): import tempfile with tempfile.NamedTemporaryFile( @@ -559,7 +560,7 @@ class UploadFile(Resource): return zip_output_path try: - zip_temp_path = create_zip_archive(temp_files, job_name, storage) + zip_temp_path = create_zip_archive(temp_files, dir_name, storage) with open(zip_temp_path, "rb") as zip_file: storage.save_file(zip_file, zip_path) task = ingest.delay( @@ -584,6 +585,8 @@ class UploadFile(Resource): job_name, zip_filename, user, + dir_name, + safe_user, ) finally: # Clean up temporary files @@ -604,7 +607,7 @@ class UploadFile(Resource): # For single file file = files[0] - filename = secure_filename(file.filename) + filename = safe_filename(file.filename) file_path = f"{base_path}/{filename}" storage.save_file(file, file_path) @@ -631,6 +634,8 @@ class UploadFile(Resource): job_name, filename, # Corrected variable for single-file case user, + dir_name, + safe_user, ) except Exception as err: current_app.logger.error(f"Error uploading file: {err}", exc_info=True) @@ -3457,7 +3462,7 @@ class StoreAttachment(Resource): jsonify({"status": "error", "message": "Missing file"}), 400, ) - user = secure_filename(decoded_token.get("sub")) + user = safe_filename(decoded_token.get("sub")) try: attachment_id = ObjectId() diff --git a/application/api/user/tasks.py b/application/api/user/tasks.py index fffa9ba9..c7003ef3 100644 --- a/application/api/user/tasks.py +++ b/application/api/user/tasks.py @@ -11,8 +11,8 @@ from application.worker import ( @celery.task(bind=True) -def ingest(self, directory, formats, name_job, filename, user): - resp = ingest_worker(self, directory, formats, name_job, filename, user) +def ingest(self, directory, formats, job_name, filename, user, dir_name, user_dir): + resp = ingest_worker(self, directory, formats, job_name, filename, user, dir_name, user_dir) return resp diff --git a/application/utils.py b/application/utils.py index 7a9cfd2b..e749c788 100644 --- a/application/utils.py +++ b/application/utils.py @@ -1,8 +1,11 @@ import hashlib +import os import re +import uuid import tiktoken from flask import jsonify, make_response +from werkzeug.utils import secure_filename _encoding = None @@ -15,6 +18,31 @@ def get_encoding(): return _encoding +def safe_filename(filename): + """ + Creates a safe filename that preserves the original extension. + Uses secure_filename, but ensures a proper filename is returned even with non-Latin characters. + + Args: + filename (str): The original filename + + Returns: + str: A safe filename that can be used for storage + """ + if not filename: + return str(uuid.uuid4()) + + _, extension = os.path.splitext(filename) + + safe_name = secure_filename(filename) + + # If secure_filename returns just the extension or an empty string + if not safe_name or safe_name == extension.lstrip('.'): + return f"{str(uuid.uuid4())}{extension}" + + return safe_name + + def num_tokens_from_string(string: str) -> int: encoding = get_encoding() if isinstance(string, str): diff --git a/application/worker.py b/application/worker.py index b652109b..c6178931 100755 --- a/application/worker.py +++ b/application/worker.py @@ -194,7 +194,7 @@ def run_agent_logic(agent_config, input_data): # Define the main function for ingesting and processing documents. def ingest_worker( - self, directory, formats, name_job, filename, user, retriever="classic" + self, directory, formats, job_name, filename, user, dir_name=None, user_dir=None, retriever="classic" ): """ Ingest and process documents. @@ -203,9 +203,11 @@ def ingest_worker( self: Reference to the instance of the task. directory (str): Specifies the directory for ingesting ('inputs' or 'temp'). formats (list of str): List of file extensions to consider for ingestion (e.g., [".rst", ".md"]). - name_job (str): Name of the job for this ingestion task. + job_name (str): Name of the job for this ingestion task (original, unsanitized). filename (str): Name of the file to be ingested. - user (str): Identifier for the user initiating the ingestion. + user (str): Identifier for the user initiating the ingestion (original, unsanitized). + dir_name (str, optional): Sanitized directory name for filesystem operations. + user_dir (str, optional): Sanitized user ID for filesystem operations. retriever (str): Type of retriever to use for processing the documents. Returns: @@ -216,13 +218,13 @@ def ingest_worker( limit = None exclude = True sample = False - + storage = StorageCreator.get_storage() - full_path = os.path.join(directory, user, name_job) + full_path = os.path.join(directory, user_dir, dir_name) source_file_path = os.path.join(full_path, filename) - logging.info(f"Ingest file: {full_path}", extra={"user": user, "job": name_job}) + logging.info(f"Ingest file: {full_path}", extra={"user": user, "job": job_name}) # Create temporary working directory with tempfile.TemporaryDirectory() as temp_dir: @@ -283,13 +285,14 @@ def ingest_worker( for i in range(min(5, len(raw_docs))): logging.info(f"Sample document {i}: {raw_docs[i]}") file_data = { - "name": name_job, + "name": job_name, # Use original job_name "file": filename, - "user": user, + "user": user, # Use original user "tokens": tokens, "retriever": retriever, "id": str(id), "type": "local", + "original_file_path": source_file_path, } upload_index(vector_store_path, file_data) @@ -301,9 +304,9 @@ def ingest_worker( return { "directory": directory, "formats": formats, - "name_job": name_job, + "name_job": job_name, # Use original job_name "filename": filename, - "user": user, + "user": user, # Use original user "limited": False, }