def safe_filename(filename):
    """
    Create a storage-safe filename that keeps the original extension.

    The name is sanitized with werkzeug's ``secure_filename``. When nothing of
    the name survives sanitization except (at most) its extension -- the usual
    outcome for names written entirely in non-ASCII characters -- a random
    UUID is used as the name instead, so the result is never empty.

    Args:
        filename (str): The original filename. May be empty or None.

    Returns:
        str: A safe filename that can be used for storage.
    """
    # Function-scope imports keep this block self-contained.
    import os
    import uuid

    if not filename:
        return str(uuid.uuid4())

    # Only needed once there is an actual name to sanitize.
    from werkzeug.utils import secure_filename

    safe_name = secure_filename(filename)

    # Sanitize the extension as well: the raw one may itself contain
    # non-ASCII or otherwise unsafe characters, so it must never be copied
    # into the result verbatim. secure_filename(".txt") yields "txt" and
    # yields "" when nothing usable is left.
    _, extension = os.path.splitext(filename)
    safe_extension = secure_filename(extension)

    # Nothing survived, or only the extension did: substitute a UUID for the
    # lost name. Comparing against the *sanitized* extension also catches
    # cases where sanitization altered the extension itself.
    if not safe_name or safe_name == safe_extension:
        unique_name = str(uuid.uuid4())
        if safe_extension:
            return f"{unique_name}.{safe_extension}"
        return unique_name

    return safe_name