Merge pull request #1838 from ManishMadan2882/main

Fixes ingestion of file with non-ascii characters in name
This commit is contained in:
Alex
2025-06-12 09:52:03 +01:00
committed by GitHub
5 changed files with 73 additions and 32 deletions

View File

@@ -7,6 +7,7 @@ import logging
from application.core.mongo_db import MongoDB
from application.core.settings import settings
from application.storage.storage_creator import StorageCreator
from application.utils import safe_filename
logger = logging.getLogger(__name__)
@@ -37,16 +38,18 @@ def upload_index_files():
"""Upload two files(index.faiss, index.pkl) to the user's folder."""
if "user" not in request.form:
return {"status": "no user"}
user = secure_filename(request.form["user"])
user = request.form["user"]
if "name" not in request.form:
return {"status": "no name"}
job_name = secure_filename(request.form["name"])
tokens = secure_filename(request.form["tokens"])
retriever = secure_filename(request.form["retriever"])
id = secure_filename(request.form["id"])
type = secure_filename(request.form["type"])
job_name = request.form["name"]
tokens = request.form["tokens"]
retriever = request.form["retriever"]
id = request.form["id"]
type = request.form["type"]
remote_data = request.form["remote_data"] if "remote_data" in request.form else None
sync_frequency = secure_filename(request.form["sync_frequency"]) if "sync_frequency" in request.form else None
sync_frequency = request.form["sync_frequency"] if "sync_frequency" in request.form else None
original_file_path = request.form.get("original_file_path")
storage = StorageCreator.get_storage()
index_base_path = f"indexes/{id}"
@@ -85,6 +88,7 @@ def upload_index_files():
"retriever": retriever,
"remote_data": remote_data,
"sync_frequency": sync_frequency,
"file_path": original_file_path,
}
},
)
@@ -102,6 +106,7 @@ def upload_index_files():
"retriever": retriever,
"remote_data": remote_data,
"sync_frequency": sync_frequency,
"file_path": original_file_path,
}
)
return {"status": "ok"}

View File

@@ -28,7 +28,7 @@ from application.core.settings import settings
from application.extensions import api
from application.storage.storage_creator import StorageCreator
from application.tts.google_tts import GoogleTTS
from application.utils import check_required_fields, validate_function_name
from application.utils import check_required_fields, safe_filename, validate_function_name
from application.vectorstore.vector_creator import VectorCreator
storage = StorageCreator.get_storage()
@@ -497,29 +497,30 @@ class UploadFile(Resource):
),
400,
)
user = secure_filename(decoded_token.get("sub"))
job_name = secure_filename(request.form["name"])
user = decoded_token.get("sub")
job_name = request.form["name"]
# Create safe versions for filesystem operations
safe_user = safe_filename(user)
dir_name = safe_filename(job_name)
try:
from application.storage.storage_creator import StorageCreator
storage = StorageCreator.get_storage()
base_path = f"{settings.UPLOAD_FOLDER}/{user}/{job_name}"
base_path = f"{settings.UPLOAD_FOLDER}/{safe_user}/{dir_name}"
if len(files) > 1:
temp_files = []
for file in files:
filename = secure_filename(file.filename)
filename = safe_filename(file.filename)
temp_path = f"{base_path}/temp/{filename}"
storage.save_file(file, temp_path)
temp_files.append(temp_path)
print(f"Saved file: {filename}")
zip_filename = f"{job_name}.zip"
zip_filename = f"{dir_name}.zip"
zip_path = f"{base_path}/{zip_filename}"
zip_temp_path = None
def create_zip_archive(temp_paths, job_name, storage):
def create_zip_archive(temp_paths, dir_name, storage):
import tempfile
with tempfile.NamedTemporaryFile(
@@ -559,7 +560,7 @@ class UploadFile(Resource):
return zip_output_path
try:
zip_temp_path = create_zip_archive(temp_files, job_name, storage)
zip_temp_path = create_zip_archive(temp_files, dir_name, storage)
with open(zip_temp_path, "rb") as zip_file:
storage.save_file(zip_file, zip_path)
task = ingest.delay(
@@ -584,6 +585,8 @@ class UploadFile(Resource):
job_name,
zip_filename,
user,
dir_name,
safe_user,
)
finally:
# Clean up temporary files
@@ -604,7 +607,7 @@ class UploadFile(Resource):
# For single file
file = files[0]
filename = secure_filename(file.filename)
filename = safe_filename(file.filename)
file_path = f"{base_path}/{filename}"
storage.save_file(file, file_path)
@@ -631,6 +634,8 @@ class UploadFile(Resource):
job_name,
filename, # Corrected variable for single-file case
user,
dir_name,
safe_user,
)
except Exception as err:
current_app.logger.error(f"Error uploading file: {err}", exc_info=True)
@@ -3457,7 +3462,7 @@ class StoreAttachment(Resource):
jsonify({"status": "error", "message": "Missing file"}),
400,
)
user = secure_filename(decoded_token.get("sub"))
user = safe_filename(decoded_token.get("sub"))
try:
attachment_id = ObjectId()

View File

@@ -11,8 +11,8 @@ from application.worker import (
@celery.task(bind=True)
def ingest(self, directory, formats, name_job, filename, user):
resp = ingest_worker(self, directory, formats, name_job, filename, user)
def ingest(self, directory, formats, job_name, filename, user, dir_name, user_dir):
resp = ingest_worker(self, directory, formats, job_name, filename, user, dir_name, user_dir)
return resp

View File

@@ -1,8 +1,11 @@
import hashlib
import os
import re
import uuid
import tiktoken
from flask import jsonify, make_response
from werkzeug.utils import secure_filename
_encoding = None
@@ -15,6 +18,31 @@ def get_encoding():
return _encoding
def safe_filename(filename):
"""
Creates a safe filename that preserves the original extension.
Uses secure_filename, but ensures a proper filename is returned even with non-Latin characters.
Args:
filename (str): The original filename
Returns:
str: A safe filename that can be used for storage
"""
if not filename:
return str(uuid.uuid4())
_, extension = os.path.splitext(filename)
safe_name = secure_filename(filename)
# If secure_filename returns just the extension or an empty string
if not safe_name or safe_name == extension.lstrip('.'):
return f"{str(uuid.uuid4())}{extension}"
return safe_name
def num_tokens_from_string(string: str) -> int:
encoding = get_encoding()
if isinstance(string, str):

View File

@@ -194,7 +194,7 @@ def run_agent_logic(agent_config, input_data):
# Define the main function for ingesting and processing documents.
def ingest_worker(
self, directory, formats, name_job, filename, user, retriever="classic"
self, directory, formats, job_name, filename, user, dir_name=None, user_dir=None, retriever="classic"
):
"""
Ingest and process documents.
@@ -203,9 +203,11 @@ def ingest_worker(
self: Reference to the instance of the task.
directory (str): Specifies the directory for ingesting ('inputs' or 'temp').
formats (list of str): List of file extensions to consider for ingestion (e.g., [".rst", ".md"]).
name_job (str): Name of the job for this ingestion task.
job_name (str): Name of the job for this ingestion task (original, unsanitized).
filename (str): Name of the file to be ingested.
user (str): Identifier for the user initiating the ingestion.
user (str): Identifier for the user initiating the ingestion (original, unsanitized).
dir_name (str, optional): Sanitized directory name for filesystem operations.
user_dir (str, optional): Sanitized user ID for filesystem operations.
retriever (str): Type of retriever to use for processing the documents.
Returns:
@@ -216,13 +218,13 @@ def ingest_worker(
limit = None
exclude = True
sample = False
storage = StorageCreator.get_storage()
full_path = os.path.join(directory, user, name_job)
full_path = os.path.join(directory, user_dir, dir_name)
source_file_path = os.path.join(full_path, filename)
logging.info(f"Ingest file: {full_path}", extra={"user": user, "job": name_job})
logging.info(f"Ingest file: {full_path}", extra={"user": user, "job": job_name})
# Create temporary working directory
with tempfile.TemporaryDirectory() as temp_dir:
@@ -283,13 +285,14 @@ def ingest_worker(
for i in range(min(5, len(raw_docs))):
logging.info(f"Sample document {i}: {raw_docs[i]}")
file_data = {
"name": name_job,
"name": job_name, # Use original job_name
"file": filename,
"user": user,
"user": user, # Use original user
"tokens": tokens,
"retriever": retriever,
"id": str(id),
"type": "local",
"original_file_path": source_file_path,
}
upload_index(vector_store_path, file_data)
@@ -301,9 +304,9 @@ def ingest_worker(
return {
"directory": directory,
"formats": formats,
"name_job": name_job,
"name_job": job_name, # Use original job_name
"filename": filename,
"user": user,
"user": user, # Use original user
"limited": False,
}