import hashlib import os import re import uuid import tiktoken from flask import jsonify, make_response from werkzeug.utils import secure_filename from application.core.model_utils import get_token_limit from application.core.settings import settings _encoding = None def get_encoding(): global _encoding if _encoding is None: _encoding = tiktoken.get_encoding("cl100k_base") return _encoding def get_gpt_model() -> str: """Get GPT model based on provider""" model_map = { "openai": "gpt-4o-mini", "anthropic": "claude-2", "groq": "llama3-8b-8192", "novita": "deepseek/deepseek-r1", } return settings.LLM_NAME or model_map.get(settings.LLM_PROVIDER, "") def safe_filename(filename): """Create safe filename, preserving extension. Handles non-Latin characters.""" if not filename: return str(uuid.uuid4()) _, extension = os.path.splitext(filename) safe_name = secure_filename(filename) # If secure_filename returns just the extension or an empty string if not safe_name or safe_name == extension.lstrip("."): return f"{str(uuid.uuid4())}{extension}" return safe_name def num_tokens_from_string(string: str) -> int: encoding = get_encoding() if isinstance(string, str): num_tokens = len(encoding.encode(string)) return num_tokens else: return 0 def num_tokens_from_object_or_list(thing): if isinstance(thing, list): return sum([num_tokens_from_object_or_list(x) for x in thing]) elif isinstance(thing, dict): return sum([num_tokens_from_object_or_list(x) for x in thing.values()]) elif isinstance(thing, str): return num_tokens_from_string(thing) else: return 0 def count_tokens_docs(docs): docs_content = "" for doc in docs: docs_content += doc.page_content tokens = num_tokens_from_string(docs_content) return tokens def calculate_doc_token_budget( model_id: str = "gpt-4o", history_token_limit: int = 2000 ) -> int: total_context = get_token_limit(model_id) reserved = sum(settings.RESERVED_TOKENS.values()) doc_budget = total_context - history_token_limit - reserved return max(doc_budget, 1000) def get_missing_fields(data, required_fields): """Check for missing required fields. Returns list of missing field names.""" return [field for field in required_fields if field not in data] def check_required_fields(data, required_fields): """Validate required fields. Returns Flask 400 response if validation fails, None otherwise.""" missing_fields = get_missing_fields(data, required_fields) if missing_fields: return make_response( jsonify( { "success": False, "message": f"Missing required fields: {', '.join(missing_fields)}", } ), 400, ) return None def get_field_validation_errors(data, required_fields): """Check for missing and empty fields. Returns dict with 'missing_fields' and 'empty_fields', or None.""" missing_fields = [] empty_fields = [] for field in required_fields: if field not in data: missing_fields.append(field) elif not data[field]: empty_fields.append(field) if missing_fields or empty_fields: return {"missing_fields": missing_fields, "empty_fields": empty_fields} return None def validate_required_fields(data, required_fields): """Validate required fields (must exist and be non-empty). Returns Flask 400 response if validation fails, None otherwise.""" errors_dict = get_field_validation_errors(data, required_fields) if errors_dict: errors = [] if errors_dict["missing_fields"]: errors.append( f"Missing required fields: {', '.join(errors_dict['missing_fields'])}" ) if errors_dict["empty_fields"]: errors.append( f"Empty values in required fields: {', '.join(errors_dict['empty_fields'])}" ) return make_response( jsonify({"success": False, "message": " | ".join(errors)}), 400 ) return None def get_hash(data): return hashlib.md5(data.encode(), usedforsecurity=False).hexdigest() def limit_chat_history(history, max_token_limit=None, model_id="docsgpt-local"): """Limit chat history to fit within token limit.""" model_token_limit = get_token_limit(model_id) max_token_limit = ( max_token_limit if max_token_limit and max_token_limit < model_token_limit else model_token_limit ) if not history: return [] trimmed_history = [] tokens_current_history = 0 for message in reversed(history): tokens_batch = 0 if "prompt" in message and "response" in message: tokens_batch += num_tokens_from_string(message["prompt"]) tokens_batch += num_tokens_from_string(message["response"]) if "tool_calls" in message: for tool_call in message["tool_calls"]: tool_call_string = f"Tool: {tool_call.get('tool_name')} | Action: {tool_call.get('action_name')} | Args: {tool_call.get('arguments')} | Response: {tool_call.get('result')}" tokens_batch += num_tokens_from_string(tool_call_string) if tokens_current_history + tokens_batch < max_token_limit: tokens_current_history += tokens_batch trimmed_history.insert(0, message) else: break return trimmed_history def validate_function_name(function_name): """Validate function name matches allowed pattern (alphanumeric, underscore, hyphen).""" if not re.match(r"^[a-zA-Z0-9_-]+$", function_name): return False return True def generate_image_url(image_path): if isinstance(image_path, str) and ( image_path.startswith("http://") or image_path.startswith("https://") ): return image_path strategy = getattr(settings, "URL_STRATEGY", "backend") if strategy == "s3": bucket_name = getattr(settings, "S3_BUCKET_NAME", "docsgpt-test-bucket") region_name = getattr(settings, "SAGEMAKER_REGION", "eu-central-1") return f"https://{bucket_name}.s3.{region_name}.amazonaws.com/{image_path}" else: base_url = getattr(settings, "API_URL", "http://localhost:7091") return f"{base_url}/api/images/{image_path}" def calculate_compression_threshold( model_id: str, threshold_percentage: float = 0.8 ) -> int: """ Calculate token threshold for triggering compression. Args: model_id: Model identifier threshold_percentage: Percentage of context window (default 80%) Returns: Token count threshold """ total_context = get_token_limit(model_id) threshold = int(total_context * threshold_percentage) return threshold def clean_text_for_tts(text: str) -> str: """ clean text for Text-to-Speech processing. """ # Handle code blocks and links text = re.sub(r"```mermaid[\s\S]*?```", " flowchart, ", text) ## ```mermaid...``` text = re.sub(r"```[\s\S]*?```", " code block, ", text) ## ```code``` text = re.sub(r"\[([^\]]+)\]\([^\)]+\)", r"\1", text) ## [text](url) text = re.sub(r"!\[([^\]]*)\]\([^\)]+\)", "", text) ## ![alt](url) # Remove markdown formatting text = re.sub(r"`([^`]+)`", r"\1", text) ## `code` text = re.sub(r"\{([^}]*)\}", r" \1 ", text) ## {text} text = re.sub(r"[{}]", " ", text) ## unmatched {} text = re.sub(r"\[([^\]]+)\]", r" \1 ", text) ## [text] text = re.sub(r"[\[\]]", " ", text) ## unmatched [] text = re.sub(r"(\*\*|__)(.*?)\1", r"\2", text) ## **bold** __bold__ text = re.sub(r"(\*|_)(.*?)\1", r"\2", text) ## *italic* _italic_ text = re.sub(r"^#{1,6}\s+", "", text, flags=re.MULTILINE) ## # headers text = re.sub(r"^>\s+", "", text, flags=re.MULTILINE) ## > blockquotes text = re.sub(r"^[\s]*[-\*\+]\s+", "", text, flags=re.MULTILINE) ## - * + lists text = re.sub(r"^[\s]*\d+\.\s+", "", text, flags=re.MULTILINE) ## 1. numbered lists text = re.sub( r"^[\*\-_]{3,}\s*$", "", text, flags=re.MULTILINE ) ## --- *** ___ rules text = re.sub(r"<[^>]*>", "", text) ## tags # Remove non-ASCII (emojis, special Unicode) text = re.sub(r"[^\x20-\x7E\n\r\t]", "", text) # Replace special sequences text = re.sub(r"-->", ", ", text) ## --> text = re.sub(r"<--", ", ", text) ## <-- text = re.sub(r"=>", ", ", text) ## => text = re.sub(r"::", " ", text) ## :: # Normalize whitespace text = re.sub(r"\s+", " ", text) text = text.strip() return text