diff --git a/application/api/internal/routes.py b/application/api/internal/routes.py
index 435b9cf4..21d97261 100755
--- a/application/api/internal/routes.py
+++ b/application/api/internal/routes.py
@@ -38,10 +38,10 @@ def upload_index_files():
     """Upload two files(index.faiss, index.pkl) to the user's folder."""
     if "user" not in request.form:
         return {"status": "no user"}
-    user = safe_filename(request.form["user"])
+    user = request.form["user"]
     if "name" not in request.form:
         return {"status": "no name"}
-    job_name = safe_filename(request.form["name"])
+    job_name = request.form["name"]
     tokens = request.form["tokens"]
     retriever = request.form["retriever"]
     id = request.form["id"]
diff --git a/application/api/user/routes.py b/application/api/user/routes.py
index e65b7891..19716488 100644
--- a/application/api/user/routes.py
+++ b/application/api/user/routes.py
@@ -497,12 +497,16 @@ class UploadFile(Resource):
                 ),
                 400,
             )
-        user = safe_filename(decoded_token.get("sub"))
-        job_name = safe_filename(request.form["name"])
+        user = decoded_token.get("sub")
+        job_name = request.form["name"]
+
+        # Create safe versions for filesystem operations
+        safe_user = safe_filename(user)
+        dir_name = safe_filename(job_name)
 
         try:
             storage = StorageCreator.get_storage()
-            base_path = f"{settings.UPLOAD_FOLDER}/{user}/{job_name}"
+            base_path = f"{settings.UPLOAD_FOLDER}/{safe_user}/{dir_name}"
 
             if len(files) > 1:
                 temp_files = []
@@ -512,11 +516,11 @@ class UploadFile(Resource):
                     storage.save_file(file, temp_path)
                     temp_files.append(temp_path)
                     print(f"Saved file: {temp_path}")
-                zip_filename = f"{job_name}.zip"
+                zip_filename = f"{dir_name}.zip"
                 zip_path = f"{base_path}/{zip_filename}"
                 zip_temp_path = None
 
-                def create_zip_archive(temp_paths, job_name, storage):
+                def create_zip_archive(temp_paths, dir_name, storage):
                     import tempfile
 
                     with tempfile.NamedTemporaryFile(
@@ -556,7 +560,7 @@ class UploadFile(Resource):
                     return zip_output_path
 
                 try:
-                    zip_temp_path = create_zip_archive(temp_files, job_name, storage)
+                    zip_temp_path = create_zip_archive(temp_files, dir_name, storage)
                     with open(zip_temp_path, "rb") as zip_file:
                         storage.save_file(zip_file, zip_path)
                     task = ingest.delay(
@@ -581,6 +585,8 @@ class UploadFile(Resource):
                         job_name,
                         zip_filename,
                         user,
+                        dir_name,
+                        safe_user,
                     )
                 finally:
                     # Clean up temporary files
@@ -628,6 +634,8 @@ class UploadFile(Resource):
                     job_name,
                     filename,  # Corrected variable for single-file case
                     user,
+                    dir_name,
+                    safe_user,
                 )
         except Exception as err:
             current_app.logger.error(f"Error uploading file: {err}", exc_info=True)
diff --git a/application/api/user/tasks.py b/application/api/user/tasks.py
index fffa9ba9..c7003ef3 100644
--- a/application/api/user/tasks.py
+++ b/application/api/user/tasks.py
@@ -11,8 +11,8 @@ from application.worker import (
 )
 
 
 @celery.task(bind=True)
-def ingest(self, directory, formats, name_job, filename, user):
-    resp = ingest_worker(self, directory, formats, name_job, filename, user)
+def ingest(self, directory, formats, job_name, filename, user, dir_name, user_dir):
+    resp = ingest_worker(self, directory, formats, job_name, filename, user, dir_name, user_dir)
     return resp
 
diff --git a/application/worker.py b/application/worker.py
index 85265308..235c969e 100755
--- a/application/worker.py
+++ b/application/worker.py
@@ -194,7 +194,7 @@ def run_agent_logic(agent_config, input_data):
 
 # Define the main function for ingesting and processing documents.
 def ingest_worker(
-    self, directory, formats, name_job, filename, user, retriever="classic"
+    self, directory, formats, job_name, filename, user, dir_name=None, user_dir=None, retriever="classic"
 ):
     """
     Ingest and process documents.
@@ -203,9 +203,11 @@ def ingest_worker(
         self: Reference to the instance of the task.
         directory (str): Specifies the directory for ingesting ('inputs' or 'temp').
         formats (list of str): List of file extensions to consider for ingestion (e.g., [".rst", ".md"]).
-        name_job (str): Name of the job for this ingestion task.
+        job_name (str): Name of the job for this ingestion task (original, unsanitized).
         filename (str): Name of the file to be ingested.
-        user (str): Identifier for the user initiating the ingestion.
+        user (str): Identifier for the user initiating the ingestion (original, unsanitized).
+        dir_name (str, optional): Sanitized directory name for filesystem operations.
+        user_dir (str, optional): Sanitized user ID for filesystem operations.
         retriever (str): Type of retriever to use for processing the documents.
 
     Returns:
@@ -216,13 +218,13 @@ def ingest_worker(
     limit = None
     exclude = True
     sample = False
-    
+
     storage = StorageCreator.get_storage()
 
-    full_path = os.path.join(directory, user, name_job)
+    full_path = os.path.join(directory, user_dir, dir_name)
     source_file_path = os.path.join(full_path, filename)
 
-    logging.info(f"Ingest file: {full_path}", extra={"user": user, "job": name_job})
+    logging.info(f"Ingest file: {full_path}", extra={"user": user, "job": job_name})
 
     # Create temporary working directory
     with tempfile.TemporaryDirectory() as temp_dir:
@@ -283,9 +285,9 @@ def ingest_worker(
         for i in range(min(5, len(raw_docs))):
             logging.info(f"Sample document {i}: {raw_docs[i]}")
         file_data = {
-            "name": name_job,
+            "name": job_name,  # Use original job_name
            "file": filename,
-            "user": user,
+            "user": user,  # Use original user
             "tokens": tokens,
             "retriever": retriever,
             "id": str(id),
@@ -302,9 +304,9 @@ def ingest_worker(
     return {
         "directory": directory,
         "formats": formats,
-        "name_job": name_job,
+        "name_job": job_name,  # Use original job_name
         "filename": filename,
-        "user": user,
+        "user": user,  # Use original user
         "limited": False,
     }
 