From 142ed75468cf4d251f2fc8c4a9f20c4617d15406 Mon Sep 17 00:00:00 2001 From: ManishMadan2882 Date: Wed, 16 Apr 2025 03:31:06 +0530 Subject: [PATCH 01/39] ((feat:fs_abstact) base --- application/storage/base.py | 73 +++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 application/storage/base.py diff --git a/application/storage/base.py b/application/storage/base.py new file mode 100644 index 00000000..c16eb600 --- /dev/null +++ b/application/storage/base.py @@ -0,0 +1,73 @@ +"""Base storage class for file system abstraction.""" +from abc import ABC, abstractmethod +from typing import BinaryIO, List + + +class BaseStorage(ABC): + """Abstract base class for storage implementations.""" + + @abstractmethod + def save_file(self, file_data: BinaryIO, path: str) -> str: + """ + Save a file to storage. + + Args: + file_data: File-like object containing the data + path: Path where the file should be stored + + Returns: + str: The complete path where the file was saved + """ + pass + + @abstractmethod + def get_file(self, path: str) -> BinaryIO: + """ + Retrieve a file from storage. + + Args: + path: Path to the file + + Returns: + BinaryIO: File-like object containing the file data + """ + pass + + @abstractmethod + def delete_file(self, path: str) -> bool: + """ + Delete a file from storage. + + Args: + path: Path to the file + + Returns: + bool: True if deletion was successful + """ + pass + + @abstractmethod + def file_exists(self, path: str) -> bool: + """ + Check if a file exists. + + Args: + path: Path to the file + + Returns: + bool: True if the file exists + """ + pass + + @abstractmethod + def list_files(self, directory: str) -> List[str]: + """ + List all files in a directory. + + Args: + directory: Directory path to list + + Returns: + List[str]: List of file paths + """ + pass From 89b2937b110509d7b472676a738179b009287821 Mon Sep 17 00:00:00 2001 From: ManishMadan2882 Date: Wed, 16 Apr 2025 03:31:28 +0530 Subject: [PATCH 02/39] ((feat:fs_abstact) local --- application/storage/local.py | 85 ++++++++++++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) create mode 100644 application/storage/local.py diff --git a/application/storage/local.py b/application/storage/local.py new file mode 100644 index 00000000..82707007 --- /dev/null +++ b/application/storage/local.py @@ -0,0 +1,85 @@ +"""Local file system implementation.""" +import os +import shutil +from typing import BinaryIO, List + +from application.core.settings import settings +from application.storage.base import BaseStorage + + +class LocalStorage(BaseStorage): + """Local file system storage implementation.""" + + def __init__(self, base_dir: str = None): + """ + Initialize local storage. + + Args: + base_dir: Base directory for all operations. If None, uses current directory. + """ + self.base_dir = base_dir or os.path.dirname( + os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + ) + + def _get_full_path(self, path: str) -> str: + """Get absolute path by combining base_dir and path.""" + if os.path.isabs(path): + return path + return os.path.join(self.base_dir, path) + + def save_file(self, file_data: BinaryIO, path: str) -> str: + """Save a file to local storage.""" + full_path = self._get_full_path(path) + + # Ensure directory exists + os.makedirs(os.path.dirname(full_path), exist_ok=True) + + # Write file + if hasattr(file_data, 'save'): + # Handle Flask's FileStorage objects + file_data.save(full_path) + else: + # Handle regular file-like objects + with open(full_path, 'wb') as f: + shutil.copyfileobj(file_data, f) + + return path + + def get_file(self, path: str) -> BinaryIO: + """Get a file from local storage.""" + full_path = self._get_full_path(path) + + if not os.path.exists(full_path): + raise FileNotFoundError(f"File not found: {full_path}") + + return open(full_path, 'rb') + + def delete_file(self, path: str) -> bool: + """Delete a file from local storage.""" + full_path = self._get_full_path(path) + + if not os.path.exists(full_path): + return False + + os.remove(full_path) + return True + + def file_exists(self, path: str) -> bool: + """Check if a file exists in local storage.""" + full_path = self._get_full_path(path) + return os.path.exists(full_path) + + def list_files(self, directory: str) -> List[str]: + """List all files in a directory in local storage.""" + full_path = self._get_full_path(directory) + + if not os.path.exists(full_path): + return [] + + result = [] + for root, _, files in os.walk(full_path): + for file in files: + rel_path = os.path.relpath(os.path.join(root, file), self.base_dir) + result.append(rel_path) + + return result From e567d8895128e46eba3c3df0fddd9e050985006b Mon Sep 17 00:00:00 2001 From: ManishMadan2882 Date: Wed, 16 Apr 2025 03:31:42 +0530 Subject: [PATCH 03/39] ((feat:fs_abstact) s3 --- application/storage/s3.py | 81 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 81 insertions(+) create mode 100644 application/storage/s3.py diff --git a/application/storage/s3.py b/application/storage/s3.py new file mode 100644 index 00000000..f9d38d09 --- /dev/null +++ b/application/storage/s3.py @@ -0,0 +1,81 @@ +"""S3 storage implementation.""" +import io +from typing import BinaryIO, List + +import boto3 +from botocore.exceptions import ClientError + +from application.storage.base import BaseStorage + + +class S3Storage(BaseStorage): + """AWS S3 storage implementation.""" + + def __init__(self, bucket_name: str, aws_access_key_id=None, + aws_secret_access_key=None, region_name=None): + """ + Initialize S3 storage. + + Args: + bucket_name: S3 bucket name + aws_access_key_id: AWS access key ID (optional if using IAM roles) + aws_secret_access_key: AWS secret access key (optional if using IAM roles) + region_name: AWS region name (optional) + """ + self.bucket_name = bucket_name + + # Initialize S3 client + self.s3 = boto3.client( + 's3', + aws_access_key_id=aws_access_key_id, + aws_secret_access_key=aws_secret_access_key, + region_name=region_name + ) + + def save_file(self, file_data: BinaryIO, path: str) -> str: + """Save a file to S3 storage.""" + self.s3.upload_fileobj(file_data, self.bucket_name, path) + return path + + def get_file(self, path: str) -> BinaryIO: + """Get a file from S3 storage.""" + if not self.file_exists(path): + raise FileNotFoundError(f"File not found: {path}") + + file_obj = io.BytesIO() + self.s3.download_fileobj(self.bucket_name, path, file_obj) + file_obj.seek(0) + return file_obj + + def delete_file(self, path: str) -> bool: + """Delete a file from S3 storage.""" + try: + self.s3.delete_object(Bucket=self.bucket_name, Key=path) + return True + except ClientError: + return False + + def file_exists(self, path: str) -> bool: + """Check if a file exists in S3 storage.""" + try: + self.s3.head_object(Bucket=self.bucket_name, Key=path) + return True + except ClientError: + return False + + def list_files(self, directory: str) -> List[str]: + """List all files in a directory in S3 storage.""" + # Ensure directory ends with a slash if it's not empty + if directory and not directory.endswith('/'): + directory += '/' + + result = [] + paginator = self.s3.get_paginator('list_objects_v2') + pages = paginator.paginate(Bucket=self.bucket_name, Prefix=directory) + + for page in pages: + if 'Contents' in page: + for obj in page['Contents']: + result.append(obj['Key']) + + return result \ No newline at end of file From 377e33c148c664d21e9b3c580cce1909352aca5e Mon Sep 17 00:00:00 2001 From: ManishMadan2882 Date: Wed, 16 Apr 2025 03:36:45 +0530 Subject: [PATCH 04/39] (feat:file_abstract) process files method --- application/storage/base.py | 20 +++++++++++++++++- application/storage/local.py | 24 ++++++++++++++++++++-- application/storage/s3.py | 39 +++++++++++++++++++++++++++++++++--- 3 files changed, 77 insertions(+), 6 deletions(-) diff --git a/application/storage/base.py b/application/storage/base.py index c16eb600..cb205091 100644 --- a/application/storage/base.py +++ b/application/storage/base.py @@ -1,6 +1,6 @@ """Base storage class for file system abstraction.""" from abc import ABC, abstractmethod -from typing import BinaryIO, List +from typing import BinaryIO, List, Optional, Callable class BaseStorage(ABC): @@ -33,6 +33,24 @@ class BaseStorage(ABC): """ pass + @abstractmethod + def process_file(self, path: str, processor_func: Callable, **kwargs): + """ + Process a file using the provided processor function. + + This method handles the details of retrieving the file and providing + it to the processor function in an appropriate way based on the storage type. + + Args: + path: Path to the file + processor_func: Function that processes the file + **kwargs: Additional arguments to pass to the processor function + + Returns: + The result of the processor function + """ + pass + @abstractmethod def delete_file(self, path: str) -> bool: """ diff --git a/application/storage/local.py b/application/storage/local.py index 82707007..91c5c264 100644 --- a/application/storage/local.py +++ b/application/storage/local.py @@ -1,9 +1,8 @@ """Local file system implementation.""" import os import shutil -from typing import BinaryIO, List +from typing import BinaryIO, List, Callable -from application.core.settings import settings from application.storage.base import BaseStorage @@ -83,3 +82,24 @@ class LocalStorage(BaseStorage): result.append(rel_path) return result + + def process_file(self, path: str, processor_func: Callable, **kwargs): + """ + Process a file using the provided processor function. + + For local storage, we can directly pass the full path to the processor. + + Args: + path: Path to the file + processor_func: Function that processes the file + **kwargs: Additional arguments to pass to the processor function + + Returns: + The result of the processor function + """ + full_path = self._get_full_path(path) + + if not os.path.exists(full_path): + raise FileNotFoundError(f"File not found: {full_path}") + + return processor_func(file_path=full_path, **kwargs) diff --git a/application/storage/s3.py b/application/storage/s3.py index f9d38d09..cdec6887 100644 --- a/application/storage/s3.py +++ b/application/storage/s3.py @@ -1,6 +1,6 @@ """S3 storage implementation.""" import io -from typing import BinaryIO, List +from typing import BinaryIO, List, Callable import boto3 from botocore.exceptions import ClientError @@ -24,7 +24,6 @@ class S3Storage(BaseStorage): """ self.bucket_name = bucket_name - # Initialize S3 client self.s3 = boto3.client( 's3', aws_access_key_id=aws_access_key_id, @@ -78,4 +77,38 @@ class S3Storage(BaseStorage): for obj in page['Contents']: result.append(obj['Key']) - return result \ No newline at end of file + return result + + def process_file(self, path: str, processor_func: Callable, **kwargs): + """ + Process a file using the provided processor function. + + For S3 storage, we need to download the file to a temporary location first. + + Args: + path: Path to the file + processor_func: Function that processes the file + **kwargs: Additional arguments to pass to the processor function + + Returns: + The result of the processor function + """ + import tempfile + import os + + if not self.file_exists(path): + raise FileNotFoundError(f"File not found: {path}") + + with tempfile.NamedTemporaryFile(delete=False) as temp_file: + self.s3.download_fileobj(self.bucket_name, path, temp_file) + temp_path = temp_file.name + + try: + result = processor_func(file_path=temp_path, **kwargs) + return result + finally: + try: + os.unlink(temp_path) + except Exception as e: + import logging + logging.warning(f"Failed to delete temporary file: {e}") From 0a0e16547e95a084cfbac4f0bcaed1f4a85fc613 Mon Sep 17 00:00:00 2001 From: ManishMadan2882 Date: Thu, 17 Apr 2025 02:35:45 +0530 Subject: [PATCH 05/39] (feat:fs_abstract) attachment uploads --- application/api/user/routes.py | 23 ++------ application/api/user/tasks.py | 4 +- application/worker.py | 99 +++++++++++++++++++--------------- 3 files changed, 63 insertions(+), 63 deletions(-) diff --git a/application/api/user/routes.py b/application/api/user/routes.py index 91b028d5..98af5343 100644 --- a/application/api/user/routes.py +++ b/application/api/user/routes.py @@ -2494,7 +2494,6 @@ class StoreAttachment(Resource): if not decoded_token: return make_response(jsonify({"success": False}), 401) - # Get single file instead of list file = request.files.get("file") if not file or file.filename == "": @@ -2508,29 +2507,18 @@ class StoreAttachment(Resource): try: attachment_id = ObjectId() original_filename = secure_filename(file.filename) + relative_path = f"{settings.UPLOAD_FOLDER}/{user}/attachments/{str(attachment_id)}/{original_filename}" - save_dir = os.path.join( - current_dir, - settings.UPLOAD_FOLDER, - user, - "attachments", - str(attachment_id) - ) - os.makedirs(save_dir, exist_ok=True) + file_content = file.read() - file_path = os.path.join(save_dir, original_filename) - - - file.save(file_path) file_info = { "filename": original_filename, - "attachment_id": str(attachment_id) + "attachment_id": str(attachment_id), + "path": relative_path, + "file_content": file_content } - current_app.logger.info(f"Saved file: {file_path}") - # Start async task to process single file task = store_attachment.delay( - save_dir, file_info, user ) @@ -2543,7 +2531,6 @@ class StoreAttachment(Resource): }), 200 ) - except Exception as err: current_app.logger.error(f"Error storing attachment: {err}") return make_response(jsonify({"success": False, "error": str(err)}), 400) diff --git a/application/api/user/tasks.py b/application/api/user/tasks.py index 24cff3c6..c9d4d39d 100644 --- a/application/api/user/tasks.py +++ b/application/api/user/tasks.py @@ -23,8 +23,8 @@ def schedule_syncs(self, frequency): @celery.task(bind=True) -def store_attachment(self, directory, saved_files, user): - resp = attachment_worker(self, directory, saved_files, user) +def store_attachment(self, file_info, user): + resp = attachment_worker(self, file_info, user) return resp diff --git a/application/worker.py b/application/worker.py index bbd422ac..d561a53f 100755 --- a/application/worker.py +++ b/application/worker.py @@ -3,15 +3,21 @@ import os import shutil import string import zipfile +import io +import datetime +import mimetypes +import requests + from collections import Counter from urllib.parse import urljoin -import requests +from application.storage.storage_creator import StorageCreator +from application.utils import num_tokens_from_string +from application.core.settings import settings +from application.parser.file.bulk import SimpleDirectoryReader from bson.objectid import ObjectId from application.core.mongo_db import MongoDB -from application.core.settings import settings -from application.parser.file.bulk import SimpleDirectoryReader from application.parser.embedding_pipeline import embed_and_store_documents from application.parser.remote.remote_creator import RemoteCreator from application.parser.schema.base import Document @@ -313,23 +319,11 @@ def sync_worker(self, frequency): for key in ["total_sync_count", "sync_success", "sync_failure"] } -def attachment_worker(self, directory, file_info, user): + +def attachment_worker(self, file_info, user): """ Process and store a single attachment without vectorization. - - Args: - self: Reference to the instance of the task. - directory (str): Base directory for storing files. - file_info (dict): Dictionary with folder and filename info. - user (str): User identifier. - - Returns: - dict: Information about processed attachment. """ - import datetime - import os - import mimetypes - from application.utils import num_tokens_from_string mongo = MongoDB.get_client() db = mongo["docsgpt"] @@ -337,60 +331,79 @@ def attachment_worker(self, directory, file_info, user): filename = file_info["filename"] attachment_id = file_info["attachment_id"] - - logging.info(f"Processing attachment: {attachment_id}/{filename}", extra={"user": user}) - - self.update_state(state="PROGRESS", meta={"current": 10}) - - file_path = os.path.join(directory, filename) - - if not os.path.exists(file_path): - logging.warning(f"File not found: {file_path}", extra={"user": user}) - raise FileNotFoundError(f"File not found: {file_path}") + relative_path = file_info["path"] + file_content = file_info["file_content"] try: - reader = SimpleDirectoryReader( - input_files=[file_path] - ) - documents = reader.load_data() + self.update_state(state="PROGRESS", meta={"current": 10}) - self.update_state(state="PROGRESS", meta={"current": 50}) + storage_type = getattr(settings, "STORAGE_TYPE", "local") + storage = StorageCreator.create_storage(storage_type) - if documents: + self.update_state(state="PROGRESS", meta={"current": 30, "status": "Saving file"}) + file_obj = io.BytesIO(file_content) + storage.save_file(file_obj, relative_path) + + def process_document(file_path, **kwargs): + self.update_state(state="PROGRESS", meta={"current": 50, "status": "Processing content"}) + + reader = SimpleDirectoryReader( + input_files=[file_path], + exclude_hidden=True, + errors="ignore" + ) + documents = reader.load_data() + + if not documents: + logging.warning(f"No content extracted from file: {filename}") + raise ValueError(f"Failed to extract content from file: {filename}") + content = documents[0].text token_count = num_tokens_from_string(content) - file_path_relative = f"{settings.UPLOAD_FOLDER}/{user}/attachments/{attachment_id}/{filename}" + mime_type = mimetypes.guess_type(filename)[0] or 'application/octet-stream' - mime_type = mimetypes.guess_type(file_path)[0] or 'application/octet-stream' + metadata = { + "storage_type": storage_type, + } + + if storage_type == "s3": + metadata.update({ + "bucket_name": getattr(storage, "bucket_name", "docsgpt-test-bucket"), + "uri": f"s3://{storage.bucket_name}/{relative_path}", + "region": getattr(settings, "SAGEMAKER_REGION", "us-east-1") + }) + + self.update_state(state="PROGRESS", meta={"current": 80, "status": "Storing in database"}) doc_id = ObjectId(attachment_id) attachments_collection.insert_one({ "_id": doc_id, "user": user, - "path": file_path_relative, + "path": relative_path, "content": content, "token_count": token_count, "mime_type": mime_type, "date": datetime.datetime.now(), + "metadata": metadata }) logging.info(f"Stored attachment with ID: {attachment_id}", extra={"user": user}) - self.update_state(state="PROGRESS", meta={"current": 100}) + self.update_state(state="PROGRESS", meta={"current": 100, "status": "Complete"}) return { "filename": filename, - "path": file_path_relative, + "path": relative_path, "token_count": token_count, "attachment_id": attachment_id, - "mime_type": mime_type + "mime_type": mime_type, + "metadata": metadata } - else: - logging.warning("No content was extracted from the file", - extra={"user": user}) - raise ValueError("No content was extracted from the file") + + return storage.process_file(relative_path, process_document) + except Exception as e: logging.error(f"Error processing file {filename}: {e}", extra={"user": user}, exc_info=True) raise From 9454150f7d125d179bcfb97629f0cb16ebdf6932 Mon Sep 17 00:00:00 2001 From: ManishMadan2882 Date: Thu, 17 Apr 2025 02:36:55 +0530 Subject: [PATCH 06/39] (fix:s3) processor func --- application/storage/base.py | 2 +- application/storage/s3.py | 42 ++++++++++++++++++------------------- 2 files changed, 21 insertions(+), 23 deletions(-) diff --git a/application/storage/base.py b/application/storage/base.py index cb205091..88fed0c6 100644 --- a/application/storage/base.py +++ b/application/storage/base.py @@ -1,6 +1,6 @@ """Base storage class for file system abstraction.""" from abc import ABC, abstractmethod -from typing import BinaryIO, List, Optional, Callable +from typing import BinaryIO, List, Callable class BaseStorage(ABC): diff --git a/application/storage/s3.py b/application/storage/s3.py index cdec6887..e02a2a5a 100644 --- a/application/storage/s3.py +++ b/application/storage/s3.py @@ -1,28 +1,31 @@ """S3 storage implementation.""" import io from typing import BinaryIO, List, Callable +import os import boto3 from botocore.exceptions import ClientError from application.storage.base import BaseStorage +from application.core.settings import settings class S3Storage(BaseStorage): """AWS S3 storage implementation.""" - def __init__(self, bucket_name: str, aws_access_key_id=None, - aws_secret_access_key=None, region_name=None): + def __init__(self, bucket_name=None): """ Initialize S3 storage. Args: - bucket_name: S3 bucket name - aws_access_key_id: AWS access key ID (optional if using IAM roles) - aws_secret_access_key: AWS secret access key (optional if using IAM roles) - region_name: AWS region name (optional) + bucket_name: S3 bucket name (optional, defaults to settings) """ - self.bucket_name = bucket_name + self.bucket_name = bucket_name or getattr(settings, "S3_BUCKET_NAME", "docsgpt-test-bucket") + + # Get credentials from settings + aws_access_key_id = getattr(settings, "SAGEMAKER_ACCESS_KEY", None) + aws_secret_access_key = getattr(settings, "SAGEMAKER_SECRET_KEY", None) + region_name = getattr(settings, "SAGEMAKER_REGION", None) self.s3 = boto3.client( 's3', @@ -83,8 +86,6 @@ class S3Storage(BaseStorage): """ Process a file using the provided processor function. - For S3 storage, we need to download the file to a temporary location first. - Args: path: Path to the file processor_func: Function that processes the file @@ -94,21 +95,18 @@ class S3Storage(BaseStorage): The result of the processor function """ import tempfile - import os + import logging if not self.file_exists(path): - raise FileNotFoundError(f"File not found: {path}") + raise FileNotFoundError(f"File not found in S3: {path}") - with tempfile.NamedTemporaryFile(delete=False) as temp_file: - self.s3.download_fileobj(self.bucket_name, path, temp_file) - temp_path = temp_file.name - - try: - result = processor_func(file_path=temp_path, **kwargs) - return result - finally: + with tempfile.NamedTemporaryFile(suffix=os.path.splitext(path)[1], delete=True) as temp_file: try: - os.unlink(temp_path) + # Download the file from S3 to the temporary file + self.s3.download_fileobj(self.bucket_name, path, temp_file) + temp_file.flush() + result = processor_func(file_path=temp_file.name, **kwargs) + return result except Exception as e: - import logging - logging.warning(f"Failed to delete temporary file: {e}") + logging.error(f"Error processing S3 file {path}: {e}", exc_info=True) + raise From 68e4cf4d1415fdb66d9dbaf93c30451115ea18df Mon Sep 17 00:00:00 2001 From: ManishMadan2882 Date: Thu, 17 Apr 2025 02:40:53 +0530 Subject: [PATCH 07/39] (feat:fsabstract) add factory class --- application/storage/storage_creator.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 application/storage/storage_creator.py diff --git a/application/storage/storage_creator.py b/application/storage/storage_creator.py new file mode 100644 index 00000000..dcf64983 --- /dev/null +++ b/application/storage/storage_creator.py @@ -0,0 +1,21 @@ +"""Storage factory for creating different storage implementations.""" +from typing import Dict, Type + +from application.storage.base import BaseStorage +from application.storage.local import LocalStorage +from application.storage.s3 import S3Storage + + +class StorageCreator: + storages: Dict[str, Type[BaseStorage]] = { + "local": LocalStorage, + "s3": S3Storage, + } + + @classmethod + def create_storage(cls, type_name: str, *args, **kwargs) -> BaseStorage: + storage_class = cls.storages.get(type_name.lower()) + if not storage_class: + raise ValueError(f"No storage implementation found for type {type_name}") + + return storage_class(*args, **kwargs) From 0d3e6157cd487fcba1e59a63d496d40bf404a457 Mon Sep 17 00:00:00 2001 From: ManishMadan2882 Date: Thu, 17 Apr 2025 16:23:01 +0530 Subject: [PATCH 08/39] (feat:attachmentUpload) parse content before upload --- application/worker.py | 65 ++++++++++++++++++------------------------- 1 file changed, 27 insertions(+), 38 deletions(-) diff --git a/application/worker.py b/application/worker.py index d561a53f..b5caa23e 100755 --- a/application/worker.py +++ b/application/worker.py @@ -7,11 +7,12 @@ import io import datetime import mimetypes import requests +import tempfile from collections import Counter from urllib.parse import urljoin -from application.storage.storage_creator import StorageCreator +from application.storage.storage_creator import StorageCreator from application.utils import num_tokens_from_string from application.core.settings import settings from application.parser.file.bulk import SimpleDirectoryReader @@ -209,7 +210,7 @@ def remote_worker( sync_frequency="never", operation_mode="upload", doc_id=None, -): +): full_path = os.path.join(directory, user, name_job) if not os.path.exists(full_path): os.makedirs(full_path) @@ -324,58 +325,48 @@ def attachment_worker(self, file_info, user): """ Process and store a single attachment without vectorization. """ - + mongo = MongoDB.get_client() db = mongo["docsgpt"] attachments_collection = db["attachments"] - + filename = file_info["filename"] attachment_id = file_info["attachment_id"] relative_path = file_info["path"] file_content = file_info["file_content"] - + try: self.update_state(state="PROGRESS", meta={"current": 10}) - storage_type = getattr(settings, "STORAGE_TYPE", "local") storage = StorageCreator.create_storage(storage_type) - - self.update_state(state="PROGRESS", meta={"current": 30, "status": "Saving file"}) - file_obj = io.BytesIO(file_content) - storage.save_file(file_obj, relative_path) - - def process_document(file_path, **kwargs): - self.update_state(state="PROGRESS", meta={"current": 50, "status": "Processing content"}) - + self.update_state(state="PROGRESS", meta={"current": 30, "status": "Processing content"}) + + with tempfile.NamedTemporaryFile(suffix=os.path.splitext(filename)[1]) as temp_file: + temp_file.write(file_content) + temp_file.flush() reader = SimpleDirectoryReader( - input_files=[file_path], + input_files=[temp_file.name], exclude_hidden=True, errors="ignore" ) documents = reader.load_data() - + if not documents: logging.warning(f"No content extracted from file: {filename}") raise ValueError(f"Failed to extract content from file: {filename}") - + content = documents[0].text token_count = num_tokens_from_string(content) - + + self.update_state(state="PROGRESS", meta={"current": 60, "status": "Saving file"}) + file_obj = io.BytesIO(file_content) + + metadata = storage.save_file(file_obj, relative_path) + mime_type = mimetypes.guess_type(filename)[0] or 'application/octet-stream' - - metadata = { - "storage_type": storage_type, - } - - if storage_type == "s3": - metadata.update({ - "bucket_name": getattr(storage, "bucket_name", "docsgpt-test-bucket"), - "uri": f"s3://{storage.bucket_name}/{relative_path}", - "region": getattr(settings, "SAGEMAKER_REGION", "us-east-1") - }) - + self.update_state(state="PROGRESS", meta={"current": 80, "status": "Storing in database"}) - + doc_id = ObjectId(attachment_id) attachments_collection.insert_one({ "_id": doc_id, @@ -387,12 +378,12 @@ def attachment_worker(self, file_info, user): "date": datetime.datetime.now(), "metadata": metadata }) - - logging.info(f"Stored attachment with ID: {attachment_id}", + + logging.info(f"Stored attachment with ID: {attachment_id}", extra={"user": user}) - + self.update_state(state="PROGRESS", meta={"current": 100, "status": "Complete"}) - + return { "filename": filename, "path": relative_path, @@ -401,9 +392,7 @@ def attachment_worker(self, file_info, user): "mime_type": mime_type, "metadata": metadata } - - return storage.process_file(relative_path, process_document) - + except Exception as e: logging.error(f"Error processing file {filename}: {e}", extra={"user": user}, exc_info=True) raise From c35d1cecfe41660fa6b95051c650c6f56d233d25 Mon Sep 17 00:00:00 2001 From: ManishMadan2882 Date: Thu, 17 Apr 2025 16:29:34 +0530 Subject: [PATCH 09/39] (feat:file_abstract) return storage metadata after upload --- application/storage/base.py | 43 ++++++++++++++------------- application/storage/local.py | 56 +++++++++++++++++------------------- application/storage/s3.py | 46 +++++++++++++++++------------ 3 files changed, 77 insertions(+), 68 deletions(-) diff --git a/application/storage/base.py b/application/storage/base.py index 88fed0c6..273e7761 100644 --- a/application/storage/base.py +++ b/application/storage/base.py @@ -7,84 +7,87 @@ class BaseStorage(ABC): """Abstract base class for storage implementations.""" @abstractmethod - def save_file(self, file_data: BinaryIO, path: str) -> str: + def save_file(self, file_data: BinaryIO, path: str) -> dict: """ Save a file to storage. - + Args: file_data: File-like object containing the data path: Path where the file should be stored - + Returns: - str: The complete path where the file was saved + dict: A dictionary containing metadata about the saved file, including: + - 'path': The path where the file was saved + - 'storage_type': The type of storage (e.g., 'local', 's3') + - Other storage-specific metadata (e.g., 'uri', 'bucket_name', etc.) """ pass - + @abstractmethod def get_file(self, path: str) -> BinaryIO: """ Retrieve a file from storage. - + Args: path: Path to the file - + Returns: BinaryIO: File-like object containing the file data """ pass - + @abstractmethod def process_file(self, path: str, processor_func: Callable, **kwargs): """ Process a file using the provided processor function. - + This method handles the details of retrieving the file and providing it to the processor function in an appropriate way based on the storage type. - + Args: path: Path to the file processor_func: Function that processes the file **kwargs: Additional arguments to pass to the processor function - + Returns: The result of the processor function """ pass - + @abstractmethod def delete_file(self, path: str) -> bool: """ Delete a file from storage. - + Args: path: Path to the file - + Returns: bool: True if deletion was successful """ pass - + @abstractmethod def file_exists(self, path: str) -> bool: """ Check if a file exists. - + Args: path: Path to the file - + Returns: bool: True if the file exists """ pass - + @abstractmethod def list_files(self, directory: str) -> List[str]: """ List all files in a directory. - + Args: directory: Directory path to list - + Returns: List[str]: List of file paths """ diff --git a/application/storage/local.py b/application/storage/local.py index 91c5c264..db11b63c 100644 --- a/application/storage/local.py +++ b/application/storage/local.py @@ -8,98 +8,96 @@ from application.storage.base import BaseStorage class LocalStorage(BaseStorage): """Local file system storage implementation.""" - + def __init__(self, base_dir: str = None): """ Initialize local storage. - + Args: base_dir: Base directory for all operations. If None, uses current directory. """ self.base_dir = base_dir or os.path.dirname( os.path.dirname(os.path.dirname(os.path.abspath(__file__))) ) - + def _get_full_path(self, path: str) -> str: """Get absolute path by combining base_dir and path.""" if os.path.isabs(path): return path return os.path.join(self.base_dir, path) - - def save_file(self, file_data: BinaryIO, path: str) -> str: + + def save_file(self, file_data: BinaryIO, path: str) -> dict: """Save a file to local storage.""" full_path = self._get_full_path(path) - - # Ensure directory exists + os.makedirs(os.path.dirname(full_path), exist_ok=True) - - # Write file + if hasattr(file_data, 'save'): - # Handle Flask's FileStorage objects file_data.save(full_path) else: - # Handle regular file-like objects with open(full_path, 'wb') as f: shutil.copyfileobj(file_data, f) - - return path - + + return { + 'storage_type': 'local' + } + def get_file(self, path: str) -> BinaryIO: """Get a file from local storage.""" full_path = self._get_full_path(path) - + if not os.path.exists(full_path): raise FileNotFoundError(f"File not found: {full_path}") - + return open(full_path, 'rb') - + def delete_file(self, path: str) -> bool: """Delete a file from local storage.""" full_path = self._get_full_path(path) - + if not os.path.exists(full_path): return False - + os.remove(full_path) return True - + def file_exists(self, path: str) -> bool: """Check if a file exists in local storage.""" full_path = self._get_full_path(path) return os.path.exists(full_path) - + def list_files(self, directory: str) -> List[str]: """List all files in a directory in local storage.""" full_path = self._get_full_path(directory) - + if not os.path.exists(full_path): return [] - + result = [] for root, _, files in os.walk(full_path): for file in files: rel_path = os.path.relpath(os.path.join(root, file), self.base_dir) result.append(rel_path) - + return result def process_file(self, path: str, processor_func: Callable, **kwargs): """ Process a file using the provided processor function. - + For local storage, we can directly pass the full path to the processor. - + Args: path: Path to the file processor_func: Function that processes the file **kwargs: Additional arguments to pass to the processor function - + Returns: The result of the processor function """ full_path = self._get_full_path(path) - + if not os.path.exists(full_path): raise FileNotFoundError(f"File not found: {full_path}") - + return processor_func(file_path=full_path, **kwargs) diff --git a/application/storage/s3.py b/application/storage/s3.py index e02a2a5a..e8df210e 100644 --- a/application/storage/s3.py +++ b/application/storage/s3.py @@ -12,43 +12,51 @@ from application.core.settings import settings class S3Storage(BaseStorage): """AWS S3 storage implementation.""" - + def __init__(self, bucket_name=None): """ Initialize S3 storage. - + Args: bucket_name: S3 bucket name (optional, defaults to settings) """ self.bucket_name = bucket_name or getattr(settings, "S3_BUCKET_NAME", "docsgpt-test-bucket") - + # Get credentials from settings aws_access_key_id = getattr(settings, "SAGEMAKER_ACCESS_KEY", None) aws_secret_access_key = getattr(settings, "SAGEMAKER_SECRET_KEY", None) region_name = getattr(settings, "SAGEMAKER_REGION", None) - + self.s3 = boto3.client( 's3', aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key, region_name=region_name ) - - def save_file(self, file_data: BinaryIO, path: str) -> str: + + def save_file(self, file_data: BinaryIO, path: str) -> dict: """Save a file to S3 storage.""" self.s3.upload_fileobj(file_data, self.bucket_name, path) - return path - + + region = getattr(settings, "SAGEMAKER_REGION", None) + + return { + 'storage_type': 's3', + 'bucket_name': self.bucket_name, + 'uri': f's3://{self.bucket_name}/{path}', + 'region': region + } + def get_file(self, path: str) -> BinaryIO: """Get a file from S3 storage.""" if not self.file_exists(path): raise FileNotFoundError(f"File not found: {path}") - + file_obj = io.BytesIO() self.s3.download_fileobj(self.bucket_name, path, file_obj) file_obj.seek(0) return file_obj - + def delete_file(self, path: str) -> bool: """Delete a file from S3 storage.""" try: @@ -56,7 +64,7 @@ class S3Storage(BaseStorage): return True except ClientError: return False - + def file_exists(self, path: str) -> bool: """Check if a file exists in S3 storage.""" try: @@ -64,42 +72,42 @@ class S3Storage(BaseStorage): return True except ClientError: return False - + def list_files(self, directory: str) -> List[str]: """List all files in a directory in S3 storage.""" # Ensure directory ends with a slash if it's not empty if directory and not directory.endswith('/'): directory += '/' - + result = [] paginator = self.s3.get_paginator('list_objects_v2') pages = paginator.paginate(Bucket=self.bucket_name, Prefix=directory) - + for page in pages: if 'Contents' in page: for obj in page['Contents']: result.append(obj['Key']) - + return result def process_file(self, path: str, processor_func: Callable, **kwargs): """ Process a file using the provided processor function. - + Args: path: Path to the file processor_func: Function that processes the file **kwargs: Additional arguments to pass to the processor function - + Returns: The result of the processor function """ import tempfile import logging - + if not self.file_exists(path): raise FileNotFoundError(f"File not found in S3: {path}") - + with tempfile.NamedTemporaryFile(suffix=os.path.splitext(path)[1], delete=True) as temp_file: try: # Download the file from S3 to the temporary file From 335c21c48ac6785a792ea27fa86d10e6a5d41196 Mon Sep 17 00:00:00 2001 From: ManishMadan2882 Date: Thu, 17 Apr 2025 16:36:40 +0530 Subject: [PATCH 10/39] (fix:attachment) dont calculate MIME again --- application/agents/llm_handler.py | 55 +++++++++++--------------- application/llm/google_ai.py | 62 +++++++++++++----------------- application/llm/openai.py | 64 +++++++++++++++---------------- 3 files changed, 80 insertions(+), 101 deletions(-) diff --git a/application/agents/llm_handler.py b/application/agents/llm_handler.py index 7fe794f8..bf39f625 100644 --- a/application/agents/llm_handler.py +++ b/application/agents/llm_handler.py @@ -15,95 +15,86 @@ class LLMHandler(ABC): @abstractmethod def handle_response(self, agent, resp, tools_dict, messages, attachments=None, **kwargs): pass - + def prepare_messages_with_attachments(self, agent, messages, attachments=None): """ Prepare messages with attachment content if available. - + Args: agent: The current agent instance. messages (list): List of message dictionaries. attachments (list): List of attachment dictionaries with content. - + Returns: list: Messages with attachment context added to the system prompt. """ if not attachments: return messages - + logger.info(f"Preparing messages with {len(attachments)} attachments") - + supported_types = agent.llm.get_supported_attachment_types() - + supported_attachments = [] unsupported_attachments = [] - + for attachment in attachments: mime_type = attachment.get('mime_type') - if not mime_type: - import mimetypes - file_path = attachment.get('path') - if file_path: - mime_type = mimetypes.guess_type(file_path)[0] or 'application/octet-stream' - else: - unsupported_attachments.append(attachment) - continue - if mime_type in supported_types: supported_attachments.append(attachment) else: unsupported_attachments.append(attachment) - + # Process supported attachments with the LLM's custom method prepared_messages = messages if supported_attachments: logger.info(f"Processing {len(supported_attachments)} supported attachments with {agent.llm.__class__.__name__}'s method") prepared_messages = agent.llm.prepare_messages_with_attachments(messages, supported_attachments) - + # Process unsupported attachments with the default method if unsupported_attachments: logger.info(f"Processing {len(unsupported_attachments)} unsupported attachments with default method") prepared_messages = self._append_attachment_content_to_system(prepared_messages, unsupported_attachments) - + return prepared_messages - + def _append_attachment_content_to_system(self, messages, attachments): """ Default method to append attachment content to the system prompt. - + Args: messages (list): List of message dictionaries. attachments (list): List of attachment dictionaries with content. - + Returns: list: Messages with attachment context added to the system prompt. """ prepared_messages = messages.copy() - + attachment_texts = [] for attachment in attachments: logger.info(f"Adding attachment {attachment.get('id')} to context") if 'content' in attachment: attachment_texts.append(f"Attached file content:\n\n{attachment['content']}") - + if attachment_texts: combined_attachment_text = "\n\n".join(attachment_texts) - + system_found = False for i in range(len(prepared_messages)): if prepared_messages[i].get("role") == "system": prepared_messages[i]["content"] += f"\n\n{combined_attachment_text}" system_found = True break - + if not system_found: prepared_messages.insert(0, {"role": "system", "content": combined_attachment_text}) - + return prepared_messages class OpenAILLMHandler(LLMHandler): def handle_response(self, agent, resp, tools_dict, messages, attachments=None, stream: bool = True): - + messages = self.prepare_messages_with_attachments(agent, messages, attachments) logger.info(f"Messages with attachments: {messages}") if not stream: @@ -167,7 +158,7 @@ class OpenAILLMHandler(LLMHandler): if isinstance(chunk, str) and len(chunk) > 0: yield chunk continue - elif hasattr(chunk, "delta"): + elif hasattr(chunk, "delta"): chunk_delta = chunk.delta if ( @@ -258,7 +249,7 @@ class OpenAILLMHandler(LLMHandler): return resp elif isinstance(chunk, str) and len(chunk) == 0: continue - + logger.info(f"Regenerating with messages: {messages}") resp = agent.llm.gen_stream( model=agent.gpt_model, messages=messages, tools=agent.tools @@ -269,9 +260,9 @@ class OpenAILLMHandler(LLMHandler): class GoogleLLMHandler(LLMHandler): def handle_response(self, agent, resp, tools_dict, messages, attachments=None, stream: bool = True): from google.genai import types - + messages = self.prepare_messages_with_attachments(agent, messages, attachments) - + while True: if not stream: response = agent.llm.gen( diff --git a/application/llm/google_ai.py b/application/llm/google_ai.py index c049eaa2..6d709ec2 100644 --- a/application/llm/google_ai.py +++ b/application/llm/google_ai.py @@ -18,7 +18,7 @@ class GoogleLLM(BaseLLM): def get_supported_attachment_types(self): """ Return a list of MIME types supported by Google Gemini for file uploads. - + Returns: list: List of supported MIME types """ @@ -30,35 +30,35 @@ class GoogleLLM(BaseLLM): 'image/webp', 'image/gif' ] - + def prepare_messages_with_attachments(self, messages, attachments=None): """ Process attachments using Google AI's file API for more efficient handling. - + Args: messages (list): List of message dictionaries. attachments (list): List of attachment dictionaries with content and metadata. - + Returns: list: Messages formatted with file references for Google AI API. """ if not attachments: return messages - + prepared_messages = messages.copy() - + # Find the user message to attach files to the last one user_message_index = None for i in range(len(prepared_messages) - 1, -1, -1): if prepared_messages[i].get("role") == "user": user_message_index = i break - + if user_message_index is None: user_message = {"role": "user", "content": []} prepared_messages.append(user_message) user_message_index = len(prepared_messages) - 1 - + if isinstance(prepared_messages[user_message_index].get("content"), str): text_content = prepared_messages[user_message_index]["content"] prepared_messages[user_message_index]["content"] = [ @@ -66,15 +66,11 @@ class GoogleLLM(BaseLLM): ] elif not isinstance(prepared_messages[user_message_index].get("content"), list): prepared_messages[user_message_index]["content"] = [] - + files = [] for attachment in attachments: mime_type = attachment.get('mime_type') - if not mime_type: - file_path = attachment.get('path') - if file_path: - mime_type = mimetypes.guess_type(file_path)[0] or 'application/octet-stream' - + if mime_type in self.get_supported_attachment_types(): try: file_uri = self._upload_file_to_google(attachment) @@ -84,53 +80,49 @@ class GoogleLLM(BaseLLM): logging.error(f"GoogleLLM: Error uploading file: {e}") if 'content' in attachment: prepared_messages[user_message_index]["content"].append({ - "type": "text", + "type": "text", "text": f"[File could not be processed: {attachment.get('path', 'unknown')}]" }) - + if files: logging.info(f"GoogleLLM: Adding {len(files)} files to message") prepared_messages[user_message_index]["content"].append({ "files": files }) - + return prepared_messages def _upload_file_to_google(self, attachment): """ Upload a file to Google AI and return the file URI. - + Args: attachment (dict): Attachment dictionary with path and metadata. - + Returns: str: Google AI file URI for the uploaded file. """ if 'google_file_uri' in attachment: return attachment['google_file_uri'] - + file_path = attachment.get('path') if not file_path: raise ValueError("No file path provided in attachment") - + if not os.path.isabs(file_path): current_dir = os.path.dirname( os.path.dirname(os.path.dirname(os.path.abspath(__file__))) ) file_path = os.path.join(current_dir, "application", file_path) - + if not os.path.exists(file_path): raise FileNotFoundError(f"File not found: {file_path}") - - mime_type = attachment.get('mime_type') - if not mime_type: - mime_type = mimetypes.guess_type(file_path)[0] or 'application/octet-stream' - + try: response = self.client.files.upload(file=file_path) - + file_uri = response.uri - + from application.core.mongo_db import MongoDB mongo = MongoDB.get_client() db = mongo["docsgpt"] @@ -140,7 +132,7 @@ class GoogleLLM(BaseLLM): {"_id": attachment['_id']}, {"$set": {"google_file_uri": file_uri}} ) - + return file_uri except Exception as e: logging.error(f"Error uploading file to Google AI: {e}") @@ -289,7 +281,7 @@ class GoogleLLM(BaseLLM): if tools: cleaned_tools = self._clean_tools_format(tools) config.tools = cleaned_tools - + # Check if we have both tools and file attachments has_attachments = False for message in messages: @@ -299,16 +291,16 @@ class GoogleLLM(BaseLLM): break if has_attachments: break - + logging.info(f"GoogleLLM: Starting stream generation. Model: {model}, Messages: {json.dumps(messages, default=str)}, Has attachments: {has_attachments}") - + response = client.models.generate_content_stream( model=model, contents=messages, config=config, ) - - + + for chunk in response: if hasattr(chunk, "candidates") and chunk.candidates: for candidate in chunk.candidates: diff --git a/application/llm/openai.py b/application/llm/openai.py index 75bd37e0..b3e179c1 100644 --- a/application/llm/openai.py +++ b/application/llm/openai.py @@ -149,7 +149,7 @@ class OpenAILLM(BaseLLM): def get_supported_attachment_types(self): """ Return a list of MIME types supported by OpenAI for file uploads. - + Returns: list: List of supported MIME types """ @@ -161,35 +161,35 @@ class OpenAILLM(BaseLLM): 'image/webp', 'image/gif' ] - + def prepare_messages_with_attachments(self, messages, attachments=None): """ Process attachments using OpenAI's file API for more efficient handling. - + Args: messages (list): List of message dictionaries. attachments (list): List of attachment dictionaries with content and metadata. - + Returns: list: Messages formatted with file references for OpenAI API. """ if not attachments: return messages - + prepared_messages = messages.copy() - + # Find the user message to attach file_id to the last one user_message_index = None for i in range(len(prepared_messages) - 1, -1, -1): if prepared_messages[i].get("role") == "user": user_message_index = i break - + if user_message_index is None: user_message = {"role": "user", "content": []} prepared_messages.append(user_message) user_message_index = len(prepared_messages) - 1 - + if isinstance(prepared_messages[user_message_index].get("content"), str): text_content = prepared_messages[user_message_index]["content"] prepared_messages[user_message_index]["content"] = [ @@ -197,14 +197,10 @@ class OpenAILLM(BaseLLM): ] elif not isinstance(prepared_messages[user_message_index].get("content"), list): prepared_messages[user_message_index]["content"] = [] - + for attachment in attachments: mime_type = attachment.get('mime_type') - if not mime_type: - file_path = attachment.get('path') - if file_path: - mime_type = mimetypes.guess_type(file_path)[0] or 'application/octet-stream' - + if mime_type and mime_type.startswith('image/'): try: base64_image = self._get_base64_image(attachment) @@ -218,14 +214,14 @@ class OpenAILLM(BaseLLM): logging.error(f"Error processing image attachment: {e}") if 'content' in attachment: prepared_messages[user_message_index]["content"].append({ - "type": "text", + "type": "text", "text": f"[Image could not be processed: {attachment.get('path', 'unknown')}]" }) # Handle PDFs using the file API elif mime_type == 'application/pdf': try: file_id = self._upload_file_to_openai(attachment) - + prepared_messages[user_message_index]["content"].append({ "type": "file", "file": {"file_id": file_id} @@ -234,80 +230,80 @@ class OpenAILLM(BaseLLM): logging.error(f"Error uploading PDF to OpenAI: {e}") if 'content' in attachment: prepared_messages[user_message_index]["content"].append({ - "type": "text", + "type": "text", "text": f"File content:\n\n{attachment['content']}" }) - + return prepared_messages def _get_base64_image(self, attachment): """ Convert an image file to base64 encoding. - + Args: attachment (dict): Attachment dictionary with path and metadata. - + Returns: str: Base64-encoded image data. """ file_path = attachment.get('path') if not file_path: raise ValueError("No file path provided in attachment") - + if not os.path.isabs(file_path): current_dir = os.path.dirname( os.path.dirname(os.path.dirname(os.path.abspath(__file__))) ) file_path = os.path.join(current_dir, "application", file_path) - + if not os.path.exists(file_path): raise FileNotFoundError(f"File not found: {file_path}") - + with open(file_path, "rb") as image_file: return base64.b64encode(image_file.read()).decode('utf-8') def _upload_file_to_openai(self, attachment): ##pdfs """ Upload a file to OpenAI and return the file_id. - + Args: attachment (dict): Attachment dictionary with path and metadata. Expected keys: - path: Path to the file - id: Optional MongoDB ID for caching - + Returns: str: OpenAI file_id for the uploaded file. """ import os import logging - + if 'openai_file_id' in attachment: return attachment['openai_file_id'] - + file_path = attachment.get('path') if not file_path: raise ValueError("No file path provided in attachment") - + if not os.path.isabs(file_path): current_dir = os.path.dirname( os.path.dirname(os.path.dirname(os.path.abspath(__file__))) ) file_path = os.path.join(current_dir,"application", file_path) - + if not os.path.exists(file_path): raise FileNotFoundError(f"File not found: {file_path}") - + try: with open(file_path, 'rb') as file: response = self.client.files.create( file=file, purpose="assistants" ) - + file_id = response.id - + from application.core.mongo_db import MongoDB mongo = MongoDB.get_client() db = mongo["docsgpt"] @@ -317,7 +313,7 @@ class OpenAILLM(BaseLLM): {"_id": attachment['_id']}, {"$set": {"openai_file_id": file_id}} ) - + return file_id except Exception as e: logging.error(f"Error uploading file to OpenAI: {e}") @@ -327,7 +323,7 @@ class OpenAILLM(BaseLLM): class AzureOpenAILLM(OpenAILLM): def __init__( - self, api_key, user_api_key, *args, **kwargs + self, api_key, user_api_key, *args, **kwargs ): super().__init__(api_key) From 5aa51f5f3620520002e3b20c03b0a3eddf1b3e39 Mon Sep 17 00:00:00 2001 From: ManishMadan2882 Date: Fri, 18 Apr 2025 01:27:21 +0530 Subject: [PATCH 11/39] (feat:file_abstract) openai attachments comply --- application/llm/openai.py | 46 +++++++++++++++------------------------ 1 file changed, 18 insertions(+), 28 deletions(-) diff --git a/application/llm/openai.py b/application/llm/openai.py index b3e179c1..f36e87cb 100644 --- a/application/llm/openai.py +++ b/application/llm/openai.py @@ -6,6 +6,7 @@ import logging from application.core.settings import settings from application.llm.base import BaseLLM +from application.storage.storage_creator import StorageCreator class OpenAILLM(BaseLLM): @@ -20,6 +21,7 @@ class OpenAILLM(BaseLLM): self.client = OpenAI(api_key=api_key) self.api_key = api_key self.user_api_key = user_api_key + self.storage = StorageCreator.create_storage(getattr(settings, "STORAGE_TYPE", "local")) def _clean_messages_openai(self, messages): cleaned_messages = [] @@ -250,19 +252,13 @@ class OpenAILLM(BaseLLM): if not file_path: raise ValueError("No file path provided in attachment") - if not os.path.isabs(file_path): - current_dir = os.path.dirname( - os.path.dirname(os.path.dirname(os.path.abspath(__file__))) - ) - file_path = os.path.join(current_dir, "application", file_path) - - if not os.path.exists(file_path): + try: + with self.storage.get_file(file_path) as image_file: + return base64.b64encode(image_file.read()).decode('utf-8') + except FileNotFoundError: raise FileNotFoundError(f"File not found: {file_path}") - with open(file_path, "rb") as image_file: - return base64.b64encode(image_file.read()).decode('utf-8') - - def _upload_file_to_openai(self, attachment): ##pdfs + def _upload_file_to_openai(self, attachment): """ Upload a file to OpenAI and return the file_id. @@ -275,34 +271,28 @@ class OpenAILLM(BaseLLM): Returns: str: OpenAI file_id for the uploaded file. """ - import os import logging if 'openai_file_id' in attachment: return attachment['openai_file_id'] file_path = attachment.get('path') - if not file_path: - raise ValueError("No file path provided in attachment") - if not os.path.isabs(file_path): - current_dir = os.path.dirname( - os.path.dirname(os.path.dirname(os.path.abspath(__file__))) - ) - file_path = os.path.join(current_dir,"application", file_path) - - if not os.path.exists(file_path): + if not self.storage.file_exists(file_path): raise FileNotFoundError(f"File not found: {file_path}") - try: - with open(file_path, 'rb') as file: - response = self.client.files.create( - file=file, - purpose="assistants" - ) + # Use storage's process_file method to handle the file appropriately + def upload_to_openai(file_path, **kwargs): + with open(file_path, 'rb') as file: + logging.info(f"Uploading file to OpenAI: {file_path}") + response = self.client.files.create( + file=file, + purpose="assistants" + ) + return response.id - file_id = response.id + file_id = self.storage.process_file(file_path, upload_to_openai) from application.core.mongo_db import MongoDB mongo = MongoDB.get_client() From c8efef8f04c08821657dc5427b7b21354d058ff7 Mon Sep 17 00:00:00 2001 From: ManishMadan2882 Date: Fri, 18 Apr 2025 18:27:02 +0530 Subject: [PATCH 12/39] (fix:openai) image uplads, use lambda in process_files --- application/llm/openai.py | 23 +++++++++-------------- application/storage/local.py | 2 +- application/storage/s3.py | 10 +++++----- 3 files changed, 15 insertions(+), 20 deletions(-) diff --git a/application/llm/openai.py b/application/llm/openai.py index f36e87cb..87eb295b 100644 --- a/application/llm/openai.py +++ b/application/llm/openai.py @@ -1,7 +1,5 @@ import json import base64 -import os -import mimetypes import logging from application.core.settings import settings @@ -79,6 +77,8 @@ class OpenAILLM(BaseLLM): content_parts.append(item) elif "type" in item and item["type"] == "file" and "file" in item: content_parts.append(item) + elif "type" in item and item["type"] == "image_url" and "image_url" in item: + content_parts.append(item) cleaned_messages.append({"role": role, "content": content_parts}) else: raise ValueError( @@ -223,7 +223,6 @@ class OpenAILLM(BaseLLM): elif mime_type == 'application/pdf': try: file_id = self._upload_file_to_openai(attachment) - prepared_messages[user_message_index]["content"].append({ "type": "file", "file": {"file_id": file_id} @@ -282,17 +281,13 @@ class OpenAILLM(BaseLLM): raise FileNotFoundError(f"File not found: {file_path}") try: - # Use storage's process_file method to handle the file appropriately - def upload_to_openai(file_path, **kwargs): - with open(file_path, 'rb') as file: - logging.info(f"Uploading file to OpenAI: {file_path}") - response = self.client.files.create( - file=file, - purpose="assistants" - ) - return response.id - - file_id = self.storage.process_file(file_path, upload_to_openai) + file_id = self.storage.process_file( + file_path, + lambda local_path, **kwargs: self.client.files.create( + file=open(local_path, 'rb'), + purpose="assistants" + ).id + ) from application.core.mongo_db import MongoDB mongo = MongoDB.get_client() diff --git a/application/storage/local.py b/application/storage/local.py index db11b63c..fb21f08d 100644 --- a/application/storage/local.py +++ b/application/storage/local.py @@ -100,4 +100,4 @@ class LocalStorage(BaseStorage): if not os.path.exists(full_path): raise FileNotFoundError(f"File not found: {full_path}") - return processor_func(file_path=full_path, **kwargs) + return processor_func(local_path=full_path, **kwargs) diff --git a/application/storage/s3.py b/application/storage/s3.py index e8df210e..abc57c6d 100644 --- a/application/storage/s3.py +++ b/application/storage/s3.py @@ -98,23 +98,23 @@ class S3Storage(BaseStorage): path: Path to the file processor_func: Function that processes the file **kwargs: Additional arguments to pass to the processor function - + Returns: The result of the processor function """ import tempfile import logging - + if not self.file_exists(path): raise FileNotFoundError(f"File not found in S3: {path}") - + with tempfile.NamedTemporaryFile(suffix=os.path.splitext(path)[1], delete=True) as temp_file: try: # Download the file from S3 to the temporary file self.s3.download_fileobj(self.bucket_name, path, temp_file) temp_file.flush() - result = processor_func(file_path=temp_file.name, **kwargs) - return result + + return processor_func(local_path=temp_file.name, **kwargs) except Exception as e: logging.error(f"Error processing S3 file {path}: {e}", exc_info=True) raise From c50ff6faa386c992b7e0875a18aa6226203997fe Mon Sep 17 00:00:00 2001 From: ManishMadan2882 Date: Fri, 18 Apr 2025 21:03:28 +0530 Subject: [PATCH 13/39] (feat:fs abstract) googleLLM class --- application/llm/google_ai.py | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/application/llm/google_ai.py b/application/llm/google_ai.py index 6d709ec2..0cfda686 100644 --- a/application/llm/google_ai.py +++ b/application/llm/google_ai.py @@ -1,11 +1,11 @@ from google import genai from google.genai import types -import os import logging -import mimetypes import json from application.llm.base import BaseLLM +from application.storage.storage_creator import StorageCreator +from application.core.settings import settings class GoogleLLM(BaseLLM): @@ -14,6 +14,7 @@ class GoogleLLM(BaseLLM): self.api_key = api_key self.user_api_key = user_api_key self.client = genai.Client(api_key=self.api_key) + self.storage = StorageCreator.create_storage(getattr(settings, "STORAGE_TYPE", "local")) def get_supported_attachment_types(self): """ @@ -109,19 +110,14 @@ class GoogleLLM(BaseLLM): if not file_path: raise ValueError("No file path provided in attachment") - if not os.path.isabs(file_path): - current_dir = os.path.dirname( - os.path.dirname(os.path.dirname(os.path.abspath(__file__))) - ) - file_path = os.path.join(current_dir, "application", file_path) - - if not os.path.exists(file_path): + if not self.storage.file_exists(file_path): raise FileNotFoundError(f"File not found: {file_path}") try: - response = self.client.files.upload(file=file_path) - - file_uri = response.uri + file_uri = self.storage.process_file( + file_path, + lambda local_path, **kwargs: self.client.files.upload(file=local_path).uri + ) from application.core.mongo_db import MongoDB mongo = MongoDB.get_client() From 38476cfeb8dcc6aff8e3be97654e411131820bf9 Mon Sep 17 00:00:00 2001 From: ManishMadan2882 Date: Tue, 22 Apr 2025 00:57:57 +0530 Subject: [PATCH 14/39] (gfeat:storage) get storage instance based on settings --- application/storage/storage_creator.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/application/storage/storage_creator.py b/application/storage/storage_creator.py index dcf64983..3eca2f47 100644 --- a/application/storage/storage_creator.py +++ b/application/storage/storage_creator.py @@ -4,6 +4,7 @@ from typing import Dict, Type from application.storage.base import BaseStorage from application.storage.local import LocalStorage from application.storage.s3 import S3Storage +from application.core.settings import settings class StorageCreator: @@ -12,6 +13,16 @@ class StorageCreator: "s3": S3Storage, } + _instance = None + + @classmethod + def get_storage(cls) -> BaseStorage: + if cls._instance is None: + storage_type = getattr(settings, "STORAGE_TYPE", "local") + cls._instance = cls.create_storage(storage_type) + + return cls._instance + @classmethod def create_storage(cls, type_name: str, *args, **kwargs) -> BaseStorage: storage_class = cls.storages.get(type_name.lower()) From 0a31ddaae6f449f3160589208bcfaffe3ee9913d Mon Sep 17 00:00:00 2001 From: ManishMadan2882 Date: Tue, 22 Apr 2025 01:41:53 +0530 Subject: [PATCH 15/39] (feat:storage) use get storage --- application/llm/google_ai.py | 3 +-- application/llm/openai.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/application/llm/google_ai.py b/application/llm/google_ai.py index 0cfda686..06dbbdfd 100644 --- a/application/llm/google_ai.py +++ b/application/llm/google_ai.py @@ -5,7 +5,6 @@ import json from application.llm.base import BaseLLM from application.storage.storage_creator import StorageCreator -from application.core.settings import settings class GoogleLLM(BaseLLM): @@ -14,7 +13,7 @@ class GoogleLLM(BaseLLM): self.api_key = api_key self.user_api_key = user_api_key self.client = genai.Client(api_key=self.api_key) - self.storage = StorageCreator.create_storage(getattr(settings, "STORAGE_TYPE", "local")) + self.storage = StorageCreator.get_storage() def get_supported_attachment_types(self): """ diff --git a/application/llm/openai.py b/application/llm/openai.py index 87eb295b..e8df92dd 100644 --- a/application/llm/openai.py +++ b/application/llm/openai.py @@ -19,7 +19,7 @@ class OpenAILLM(BaseLLM): self.client = OpenAI(api_key=api_key) self.api_key = api_key self.user_api_key = user_api_key - self.storage = StorageCreator.create_storage(getattr(settings, "STORAGE_TYPE", "local")) + self.storage = StorageCreator.get_storage() def _clean_messages_openai(self, messages): cleaned_messages = [] From 64c42f0ddf75c0e0b9cca77050278b02672927c5 Mon Sep 17 00:00:00 2001 From: ManishMadan2882 Date: Tue, 22 Apr 2025 05:18:07 +0530 Subject: [PATCH 16/39] (feat:storage) file, indexes uploads --- application/api/internal/routes.py | 20 ++-- application/api/user/routes.py | 84 ++++++++--------- application/worker.py | 144 ++++++++++++++++++----------- 3 files changed, 142 insertions(+), 106 deletions(-) diff --git a/application/api/internal/routes.py b/application/api/internal/routes.py index c8e32d11..e95b6327 100755 --- a/application/api/internal/routes.py +++ b/application/api/internal/routes.py @@ -6,7 +6,7 @@ from bson.objectid import ObjectId from application.core.mongo_db import MongoDB from application.core.settings import settings - +from application.storage.storage_creator import StorageCreator mongo = MongoDB.get_client() db = mongo["docsgpt"] conversations_collection = db["conversations"] @@ -45,7 +45,8 @@ def upload_index_files(): remote_data = request.form["remote_data"] if "remote_data" in request.form else None sync_frequency = secure_filename(request.form["sync_frequency"]) if "sync_frequency" in request.form else None - save_dir = os.path.join(current_dir, "indexes", str(id)) + storage = StorageCreator.create_storage(settings.STORAGE_TYPE) + if settings.VECTOR_STORE == "faiss": if "file_faiss" not in request.files: print("No file part") @@ -59,12 +60,13 @@ def upload_index_files(): file_pkl = request.files["file_pkl"] if file_pkl.filename == "": return {"status": "no file name"} - # saves index files - - if not os.path.exists(save_dir): - os.makedirs(save_dir) - file_faiss.save(os.path.join(save_dir, "index.faiss")) - file_pkl.save(os.path.join(save_dir, "index.pkl")) + + # Save index files + storage_path_faiss = f"indexes/{str(id)}/index.faiss" + storage_path_pkl = f"indexes/{str(id)}/index.pkl" + + storage.save_file(file_faiss, storage_path_faiss) + storage.save_file(file_pkl, storage_path_pkl) existing_entry = sources_collection.find_one({"_id": ObjectId(id)}) if existing_entry: @@ -82,6 +84,7 @@ def upload_index_files(): "retriever": retriever, "remote_data": remote_data, "sync_frequency": sync_frequency, + "storage_type": settings.STORAGE_TYPE, } }, ) @@ -99,6 +102,7 @@ def upload_index_files(): "retriever": retriever, "remote_data": remote_data, "sync_frequency": sync_frequency, + "storage_type": settings.STORAGE_TYPE, } ) return {"status": "ok"} diff --git a/application/api/user/routes.py b/application/api/user/routes.py index 9e97e2ab..b7d79128 100644 --- a/application/api/user/routes.py +++ b/application/api/user/routes.py @@ -4,6 +4,7 @@ import math import os import shutil import uuid +import tempfile from bson.binary import Binary, UuidRepresentation from bson.dbref import DBRef @@ -21,6 +22,7 @@ from application.extensions import api from application.tts.google_tts import GoogleTTS from application.utils import check_required_fields, validate_function_name from application.vectorstore.vector_creator import VectorCreator +from application.storage.storage_creator import StorageCreator mongo = MongoDB.get_client() db = mongo["docsgpt"] @@ -413,54 +415,50 @@ class UploadFile(Resource): user = secure_filename(decoded_token.get("sub")) job_name = secure_filename(request.form["name"]) + storage = StorageCreator.get_storage() + try: - save_dir = os.path.join(current_dir, settings.UPLOAD_FOLDER, user, job_name) - os.makedirs(save_dir, exist_ok=True) - if len(files) > 1: - temp_dir = os.path.join(save_dir, "temp") - os.makedirs(temp_dir, exist_ok=True) - - for file in files: - filename = secure_filename(file.filename) - file.save(os.path.join(temp_dir, filename)) - print(f"Saved file: {filename}") - zip_path = shutil.make_archive( - base_name=os.path.join(save_dir, job_name), - format="zip", - root_dir=temp_dir, - ) - final_filename = os.path.basename(zip_path) - shutil.rmtree(temp_dir) - task = ingest.delay( - settings.UPLOAD_FOLDER, - [ - ".rst", - ".md", - ".pdf", - ".txt", - ".docx", - ".csv", - ".epub", - ".html", - ".mdx", - ".json", - ".xlsx", - ".pptx", - ".png", - ".jpg", - ".jpeg", - ], - job_name, - final_filename, - user, - ) + temp_dir = tempfile.mkdtemp() + try: + for file in files: + filename = secure_filename(file.filename) + file.save(os.path.join(temp_dir, filename)) + + zip_path = os.path.join(temp_dir, f"{job_name}.zip") + shutil.make_archive( + base_name=os.path.join(temp_dir, job_name), + format="zip", + root_dir=temp_dir, + base_dir="." + ) + + final_filename = f"{job_name}.zip" + relative_path = f"{settings.UPLOAD_FOLDER}/{user}/{job_name}/{final_filename}" + + with open(zip_path, 'rb') as zip_file: + storage.save_file(zip_file, relative_path) + + task = ingest.delay( + relative_path, + [ + ".rst", ".md", ".pdf", ".txt", ".docx", ".csv", + ".epub", ".html", ".mdx", ".json", ".xlsx", + ".pptx", ".png", ".jpg", ".jpeg", + ], + job_name, + final_filename, + user, + ) + finally: + shutil.rmtree(temp_dir) else: file = files[0] final_filename = secure_filename(file.filename) - file_path = os.path.join(save_dir, final_filename) - file.save(file_path) - + relative_path = f"{settings.UPLOAD_FOLDER}/{user}/{job_name}/{final_filename}" + + storage.save_file(file, relative_path) + task = ingest.delay( settings.UPLOAD_FOLDER, [ diff --git a/application/worker.py b/application/worker.py index b5caa23e..f8076260 100755 --- a/application/worker.py +++ b/application/worker.py @@ -133,71 +133,105 @@ def ingest_worker( limit = None exclude = True sample = False - full_path = os.path.join(directory, user, name_job) - - logging.info(f"Ingest file: {full_path}", extra={"user": user, "job": name_job}) - file_data = {"name": name_job, "file": filename, "user": user} + storage = StorageCreator.create_storage(settings.STORAGE_TYPE) + temp_dir = tempfile.mkdtemp() + full_path = os.path.join(temp_dir, name_job) + if not os.path.exists(full_path): os.makedirs(full_path) - download_file(urljoin(settings.API_URL, "/api/download"), file_data, os.path.join(full_path, filename)) - # check if file is .zip and extract it - if filename.endswith(".zip"): - extract_zip_recursive( - os.path.join(full_path, filename), full_path, 0, RECURSION_DEPTH + logging.info(f"Ingest file: {directory}/{user}/{name_job}/{filename}", extra={"user": user, "job": name_job}) + file_data = {"name": name_job, "file": filename, "user": user} + + try: + file_path = f"{directory}/{user}/{name_job}/{filename}" + + try: + file_obj = storage.get_file(file_path) + + local_file_path = os.path.join(full_path, filename) + with open(local_file_path, 'wb') as f: + shutil.copyfileobj(file_obj, f) + + # check if file is .zip and extract it + if filename.endswith(".zip"): + extract_zip_recursive( + os.path.join(full_path, filename), full_path, 0, RECURSION_DEPTH + ) + except FileNotFoundError as e: + logging.error(f"File not found in storage: {file_path}") + raise FileNotFoundError(f"File not found: {file_path}") from e + + self.update_state(state="PROGRESS", meta={"current": 1}) + + raw_docs = SimpleDirectoryReader( + input_dir=full_path, + input_files=input_files, + recursive=recursive, + required_exts=formats, + num_files_limit=limit, + exclude_hidden=exclude, + file_metadata=metadata_from_filename, + ).load_data() + + chunker = Chunker( + chunking_strategy="classic_chunk", + max_tokens=MAX_TOKENS, + min_tokens=MIN_TOKENS, + duplicate_headers=False ) + raw_docs = chunker.chunk(documents=raw_docs) - self.update_state(state="PROGRESS", meta={"current": 1}) + docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs] + id = ObjectId() - raw_docs = SimpleDirectoryReader( - input_dir=full_path, - input_files=input_files, - recursive=recursive, - required_exts=formats, - num_files_limit=limit, - exclude_hidden=exclude, - file_metadata=metadata_from_filename, - ).load_data() + vector_dir = os.path.join(temp_dir, "vector_store") + os.makedirs(vector_dir, exist_ok=True) + + embed_and_store_documents(docs, vector_dir, str(id), self) + tokens = count_tokens_docs(docs) + self.update_state(state="PROGRESS", meta={"current": 100}) - chunker = Chunker( - chunking_strategy="classic_chunk", - max_tokens=MAX_TOKENS, - min_tokens=MIN_TOKENS, - duplicate_headers=False - ) - raw_docs = chunker.chunk(documents=raw_docs) + if sample: + for i in range(min(5, len(raw_docs))): + logging.info(f"Sample document {i}: {raw_docs[i]}") - docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs] - id = ObjectId() + file_data.update({ + "tokens": tokens, + "retriever": retriever, + "id": str(id), + "type": "local", + }) + + mongo = MongoDB.get_client() + db = mongo["docsgpt"] + sources_collection = db["sources"] + + sources_collection.insert_one({ + "_id": id, + "name": name_job, + "user": user, + "date": datetime.datetime.now(), + "tokens": tokens, + "retriever": retriever, + "type": "local", + "storage_type": settings.STORAGE_TYPE, + "original_file_path": file_path + }) - embed_and_store_documents(docs, full_path, id, self) - tokens = count_tokens_docs(docs) - self.update_state(state="PROGRESS", meta={"current": 100}) - - if sample: - for i in range(min(5, len(raw_docs))): - logging.info(f"Sample document {i}: {raw_docs[i]}") - - file_data.update({ - "tokens": tokens, - "retriever": retriever, - "id": str(id), - "type": "local", - }) - upload_index(full_path, file_data) - - # delete local - shutil.rmtree(full_path) - - return { - "directory": directory, - "formats": formats, - "name_job": name_job, - "filename": filename, - "user": user, - "limited": False, - } + return { + "directory": directory, + "formats": formats, + "name_job": name_job, + "filename": filename, + "user": user, + "limited": False, + } + + finally: + if os.path.exists(temp_dir): + shutil.rmtree(temp_dir) def remote_worker( self, From 5ad34e2216e3052f075059afbf2026043047bf06 Mon Sep 17 00:00:00 2001 From: ManishMadan2882 Date: Tue, 22 Apr 2025 17:34:25 +0530 Subject: [PATCH 17/39] (fix:indexes) look for the right path --- application/api/internal/routes.py | 16 ++-- application/parser/embedding_pipeline.py | 6 +- application/vectorstore/faiss.py | 110 +++++++++++++++++++---- application/worker.py | 30 ++++--- 4 files changed, 128 insertions(+), 34 deletions(-) diff --git a/application/api/internal/routes.py b/application/api/internal/routes.py index e95b6327..6ba07431 100755 --- a/application/api/internal/routes.py +++ b/application/api/internal/routes.py @@ -1,5 +1,6 @@ import os import datetime +import logging from flask import Blueprint, request, send_from_directory from werkzeug.utils import secure_filename from bson.objectid import ObjectId @@ -46,7 +47,7 @@ def upload_index_files(): sync_frequency = secure_filename(request.form["sync_frequency"]) if "sync_frequency" in request.form else None storage = StorageCreator.create_storage(settings.STORAGE_TYPE) - + if settings.VECTOR_STORE == "faiss": if "file_faiss" not in request.files: print("No file part") @@ -60,13 +61,18 @@ def upload_index_files(): file_pkl = request.files["file_pkl"] if file_pkl.filename == "": return {"status": "no file name"} - + # Save index files storage_path_faiss = f"indexes/{str(id)}/index.faiss" storage_path_pkl = f"indexes/{str(id)}/index.pkl" - - storage.save_file(file_faiss, storage_path_faiss) - storage.save_file(file_pkl, storage_path_pkl) + + try: + storage.save_file(file_faiss, storage_path_faiss) + storage.save_file(file_pkl, storage_path_pkl) + logging.info(f"Successfully saved FAISS index files for ID {id}") + except Exception as e: + logging.error(f"Error saving FAISS index files: {e}") + return {"status": "error", "message": str(e)} existing_entry = sources_collection.find_one({"_id": ObjectId(id)}) if existing_entry: diff --git a/application/parser/embedding_pipeline.py b/application/parser/embedding_pipeline.py index 0435cd14..005d3756 100755 --- a/application/parser/embedding_pipeline.py +++ b/application/parser/embedding_pipeline.py @@ -42,17 +42,18 @@ def embed_and_store_documents(docs, folder_name, source_id, task_status): # Initialize vector store if settings.VECTOR_STORE == "faiss": + docs_init = [docs.pop(0)] store = VectorCreator.create_vectorstore( settings.VECTOR_STORE, docs_init=docs_init, - source_id=folder_name, + source_id=str(source_id), embeddings_key=os.getenv("EMBEDDINGS_KEY"), ) else: store = VectorCreator.create_vectorstore( settings.VECTOR_STORE, - source_id=source_id, + source_id=str(source_id), embeddings_key=os.getenv("EMBEDDINGS_KEY"), ) store.delete_index() @@ -82,5 +83,6 @@ def embed_and_store_documents(docs, folder_name, source_id, task_status): # Save the vector store if settings.VECTOR_STORE == "faiss": + # For FAISS, save to the temporary folder first store.save_local(folder_name) logging.info("Vector store saved successfully.") diff --git a/application/vectorstore/faiss.py b/application/vectorstore/faiss.py index 87ffcccb..5a38f966 100644 --- a/application/vectorstore/faiss.py +++ b/application/vectorstore/faiss.py @@ -1,35 +1,45 @@ import os +import tempfile +import logging from langchain_community.vectorstores import FAISS from application.core.settings import settings from application.parser.schema.base import Document from application.vectorstore.base import BaseVectorStore +from application.storage.storage_creator import StorageCreator -def get_vectorstore(path: str) -> str: - if path: - vectorstore = os.path.join("application", "indexes", path) +def get_vectorstore_path(source_id: str) -> str: + if source_id: + clean_id = source_id.replace("application/indexes/", "").rstrip("/") + return f"indexes/{clean_id}" else: - vectorstore = os.path.join("application") - return vectorstore - + return "indexes" class FaissStore(BaseVectorStore): def __init__(self, source_id: str, embeddings_key: str, docs_init=None): super().__init__() self.source_id = source_id - self.path = get_vectorstore(source_id) + self.storage = StorageCreator.get_storage() + self.storage_path = get_vectorstore_path(source_id) self.embeddings = self._get_embeddings(settings.EMBEDDINGS_NAME, embeddings_key) try: if docs_init: self.docsearch = FAISS.from_documents(docs_init, self.embeddings) else: - self.docsearch = FAISS.load_local( - self.path, self.embeddings, allow_dangerous_deserialization=True - ) - except Exception: + if self.storage.__class__.__name__ == "LocalStorage": + # For local storage, we can use the path directly + local_path = self.storage._get_full_path(self.storage_path) + self.docsearch = FAISS.load_local( + local_path, self.embeddings, allow_dangerous_deserialization=True + ) + else: + # For non-local storage (S3, etc.), download files to temp directory first + self.docsearch = self._load_from_remote_storage() + except Exception as e: + logging.error(f"Error initializing FAISS store: {e}") raise self.assert_embedding_dimensions(self.embeddings) @@ -40,8 +50,26 @@ class FaissStore(BaseVectorStore): def add_texts(self, *args, **kwargs): return self.docsearch.add_texts(*args, **kwargs) - def save_local(self, *args, **kwargs): - return self.docsearch.save_local(*args, **kwargs) + def save_local(self, folder_path=None): + path_to_use = folder_path or self.storage_path + + if folder_path or self.storage.__class__.__name__ == "LocalStorage": + # If it's a local path or temp dir, save directly + local_path = path_to_use + if self.storage.__class__.__name__ == "LocalStorage" and not folder_path: + local_path = self.storage._get_full_path(path_to_use) + + os.makedirs(os.path.dirname(local_path) if os.path.dirname(local_path) else local_path, exist_ok=True) + + self.docsearch.save_local(local_path) + + if folder_path and self.storage.__class__.__name__ != "LocalStorage": + self._upload_index_to_remote(folder_path) + else: + # For remote storage, save to temp dir first, then upload + with tempfile.TemporaryDirectory() as temp_dir: + self.docsearch.save_local(temp_dir) + self._upload_index_to_remote(temp_dir) def delete_index(self, *args, **kwargs): return self.docsearch.delete(*args, **kwargs) @@ -80,10 +108,62 @@ class FaissStore(BaseVectorStore): metadata = metadata or {} doc = Document(text=text, extra_info=metadata).to_langchain_format() doc_id = self.docsearch.add_documents([doc]) - self.save_local(self.path) + self.save_local() return doc_id def delete_chunk(self, chunk_id): self.delete_index([chunk_id]) - self.save_local(self.path) + self.save_local() return True + + def _load_from_remote_storage(self): + with tempfile.TemporaryDirectory() as temp_dir: + try: + # Check if both index files exist in remote storage + faiss_path = f"{self.storage_path}/index.faiss" + pkl_path = f"{self.storage_path}/index.pkl" + + if not self.storage.file_exists(faiss_path) or not self.storage.file_exists(pkl_path): + raise FileNotFoundError(f"FAISS index files not found at {self.storage_path}") + + # Download both files to temp directory + faiss_file = self.storage.get_file(faiss_path) + pkl_file = self.storage.get_file(pkl_path) + + local_faiss_path = os.path.join(temp_dir, "index.faiss") + local_pkl_path = os.path.join(temp_dir, "index.pkl") + + with open(local_faiss_path, 'wb') as f: + f.write(faiss_file.read()) + + with open(local_pkl_path, 'wb') as f: + f.write(pkl_file.read()) + + # Load the index from the temp directory + return FAISS.load_local( + temp_dir, self.embeddings, allow_dangerous_deserialization=True + ) + except Exception as e: + logging.error(f"Error loading FAISS index from remote storage: {e}") + raise + + def _upload_index_to_remote(self, local_folder): + try: + # Get paths to the index files + local_faiss_path = os.path.join(local_folder, "index.faiss") + local_pkl_path = os.path.join(local_folder, "index.pkl") + + remote_faiss_path = f"{self.storage_path}/index.faiss" + remote_pkl_path = f"{self.storage_path}/index.pkl" + + # Upload both files to remote storage + with open(local_faiss_path, 'rb') as f: + self.storage.save_file(f, remote_faiss_path) + + with open(local_pkl_path, 'rb') as f: + self.storage.save_file(f, remote_pkl_path) + + logging.info(f"Successfully uploaded FAISS index to {self.storage_path}") + except Exception as e: + logging.error(f"Error uploading FAISS index to remote storage: {e}") + raise diff --git a/application/worker.py b/application/worker.py index f8076260..5d32cf66 100755 --- a/application/worker.py +++ b/application/worker.py @@ -89,9 +89,12 @@ def download_file(url, params, dest_path): def upload_index(full_path, file_data): try: if settings.VECTOR_STORE == "faiss": + faiss_path = os.path.join(full_path, "index.faiss") + pkl_path = os.path.join(full_path, "index.pkl") + files = { - "file_faiss": open(full_path + "/index.faiss", "rb"), - "file_pkl": open(full_path + "/index.pkl", "rb"), + "file_faiss": open(faiss_path, "rb"), + "file_pkl": open(pkl_path, "rb"), } response = requests.post( urljoin(settings.API_URL, "/api/upload_index"), files=files, data=file_data @@ -104,8 +107,11 @@ def upload_index(full_path, file_data): except requests.RequestException as e: logging.error(f"Error uploading index: {e}") raise + except FileNotFoundError as e: + logging.error(f"File not found: {e}") + raise finally: - if settings.VECTOR_STORE == "faiss": + if settings.VECTOR_STORE == "faiss" and 'files' in locals(): for file in files.values(): file.close() @@ -137,23 +143,23 @@ def ingest_worker( storage = StorageCreator.create_storage(settings.STORAGE_TYPE) temp_dir = tempfile.mkdtemp() full_path = os.path.join(temp_dir, name_job) - + if not os.path.exists(full_path): os.makedirs(full_path) logging.info(f"Ingest file: {directory}/{user}/{name_job}/{filename}", extra={"user": user, "job": name_job}) file_data = {"name": name_job, "file": filename, "user": user} - + try: file_path = f"{directory}/{user}/{name_job}/{filename}" - + try: file_obj = storage.get_file(file_path) - + local_file_path = os.path.join(full_path, filename) with open(local_file_path, 'wb') as f: shutil.copyfileobj(file_obj, f) - + # check if file is .zip and extract it if filename.endswith(".zip"): extract_zip_recursive( @@ -188,7 +194,7 @@ def ingest_worker( vector_dir = os.path.join(temp_dir, "vector_store") os.makedirs(vector_dir, exist_ok=True) - + embed_and_store_documents(docs, vector_dir, str(id), self) tokens = count_tokens_docs(docs) self.update_state(state="PROGRESS", meta={"current": 100}) @@ -203,11 +209,11 @@ def ingest_worker( "id": str(id), "type": "local", }) - + mongo = MongoDB.get_client() db = mongo["docsgpt"] sources_collection = db["sources"] - + sources_collection.insert_one({ "_id": id, "name": name_job, @@ -228,7 +234,7 @@ def ingest_worker( "user": user, "limited": False, } - + finally: if os.path.exists(temp_dir): shutil.rmtree(temp_dir) From 24c8b24b1f367669d208cf93e07895b3d9b896f2 Mon Sep 17 00:00:00 2001 From: ManishMadan2882 Date: Wed, 23 Apr 2025 00:52:22 +0530 Subject: [PATCH 18/39] Revert "(fix:indexes) look for the right path" This reverts commit 5ad34e2216e3052f075059afbf2026043047bf06. --- application/api/internal/routes.py | 16 ++-- application/parser/embedding_pipeline.py | 6 +- application/vectorstore/faiss.py | 110 ++++------------------- application/worker.py | 30 +++---- 4 files changed, 34 insertions(+), 128 deletions(-) diff --git a/application/api/internal/routes.py b/application/api/internal/routes.py index 6ba07431..e95b6327 100755 --- a/application/api/internal/routes.py +++ b/application/api/internal/routes.py @@ -1,6 +1,5 @@ import os import datetime -import logging from flask import Blueprint, request, send_from_directory from werkzeug.utils import secure_filename from bson.objectid import ObjectId @@ -47,7 +46,7 @@ def upload_index_files(): sync_frequency = secure_filename(request.form["sync_frequency"]) if "sync_frequency" in request.form else None storage = StorageCreator.create_storage(settings.STORAGE_TYPE) - + if settings.VECTOR_STORE == "faiss": if "file_faiss" not in request.files: print("No file part") @@ -61,18 +60,13 @@ def upload_index_files(): file_pkl = request.files["file_pkl"] if file_pkl.filename == "": return {"status": "no file name"} - + # Save index files storage_path_faiss = f"indexes/{str(id)}/index.faiss" storage_path_pkl = f"indexes/{str(id)}/index.pkl" - - try: - storage.save_file(file_faiss, storage_path_faiss) - storage.save_file(file_pkl, storage_path_pkl) - logging.info(f"Successfully saved FAISS index files for ID {id}") - except Exception as e: - logging.error(f"Error saving FAISS index files: {e}") - return {"status": "error", "message": str(e)} + + storage.save_file(file_faiss, storage_path_faiss) + storage.save_file(file_pkl, storage_path_pkl) existing_entry = sources_collection.find_one({"_id": ObjectId(id)}) if existing_entry: diff --git a/application/parser/embedding_pipeline.py b/application/parser/embedding_pipeline.py index 005d3756..0435cd14 100755 --- a/application/parser/embedding_pipeline.py +++ b/application/parser/embedding_pipeline.py @@ -42,18 +42,17 @@ def embed_and_store_documents(docs, folder_name, source_id, task_status): # Initialize vector store if settings.VECTOR_STORE == "faiss": - docs_init = [docs.pop(0)] store = VectorCreator.create_vectorstore( settings.VECTOR_STORE, docs_init=docs_init, - source_id=str(source_id), + source_id=folder_name, embeddings_key=os.getenv("EMBEDDINGS_KEY"), ) else: store = VectorCreator.create_vectorstore( settings.VECTOR_STORE, - source_id=str(source_id), + source_id=source_id, embeddings_key=os.getenv("EMBEDDINGS_KEY"), ) store.delete_index() @@ -83,6 +82,5 @@ def embed_and_store_documents(docs, folder_name, source_id, task_status): # Save the vector store if settings.VECTOR_STORE == "faiss": - # For FAISS, save to the temporary folder first store.save_local(folder_name) logging.info("Vector store saved successfully.") diff --git a/application/vectorstore/faiss.py b/application/vectorstore/faiss.py index 5a38f966..87ffcccb 100644 --- a/application/vectorstore/faiss.py +++ b/application/vectorstore/faiss.py @@ -1,45 +1,35 @@ import os -import tempfile -import logging from langchain_community.vectorstores import FAISS from application.core.settings import settings from application.parser.schema.base import Document from application.vectorstore.base import BaseVectorStore -from application.storage.storage_creator import StorageCreator -def get_vectorstore_path(source_id: str) -> str: - if source_id: - clean_id = source_id.replace("application/indexes/", "").rstrip("/") - return f"indexes/{clean_id}" +def get_vectorstore(path: str) -> str: + if path: + vectorstore = os.path.join("application", "indexes", path) else: - return "indexes" + vectorstore = os.path.join("application") + return vectorstore + class FaissStore(BaseVectorStore): def __init__(self, source_id: str, embeddings_key: str, docs_init=None): super().__init__() self.source_id = source_id - self.storage = StorageCreator.get_storage() - self.storage_path = get_vectorstore_path(source_id) + self.path = get_vectorstore(source_id) self.embeddings = self._get_embeddings(settings.EMBEDDINGS_NAME, embeddings_key) try: if docs_init: self.docsearch = FAISS.from_documents(docs_init, self.embeddings) else: - if self.storage.__class__.__name__ == "LocalStorage": - # For local storage, we can use the path directly - local_path = self.storage._get_full_path(self.storage_path) - self.docsearch = FAISS.load_local( - local_path, self.embeddings, allow_dangerous_deserialization=True - ) - else: - # For non-local storage (S3, etc.), download files to temp directory first - self.docsearch = self._load_from_remote_storage() - except Exception as e: - logging.error(f"Error initializing FAISS store: {e}") + self.docsearch = FAISS.load_local( + self.path, self.embeddings, allow_dangerous_deserialization=True + ) + except Exception: raise self.assert_embedding_dimensions(self.embeddings) @@ -50,26 +40,8 @@ class FaissStore(BaseVectorStore): def add_texts(self, *args, **kwargs): return self.docsearch.add_texts(*args, **kwargs) - def save_local(self, folder_path=None): - path_to_use = folder_path or self.storage_path - - if folder_path or self.storage.__class__.__name__ == "LocalStorage": - # If it's a local path or temp dir, save directly - local_path = path_to_use - if self.storage.__class__.__name__ == "LocalStorage" and not folder_path: - local_path = self.storage._get_full_path(path_to_use) - - os.makedirs(os.path.dirname(local_path) if os.path.dirname(local_path) else local_path, exist_ok=True) - - self.docsearch.save_local(local_path) - - if folder_path and self.storage.__class__.__name__ != "LocalStorage": - self._upload_index_to_remote(folder_path) - else: - # For remote storage, save to temp dir first, then upload - with tempfile.TemporaryDirectory() as temp_dir: - self.docsearch.save_local(temp_dir) - self._upload_index_to_remote(temp_dir) + def save_local(self, *args, **kwargs): + return self.docsearch.save_local(*args, **kwargs) def delete_index(self, *args, **kwargs): return self.docsearch.delete(*args, **kwargs) @@ -108,62 +80,10 @@ class FaissStore(BaseVectorStore): metadata = metadata or {} doc = Document(text=text, extra_info=metadata).to_langchain_format() doc_id = self.docsearch.add_documents([doc]) - self.save_local() + self.save_local(self.path) return doc_id def delete_chunk(self, chunk_id): self.delete_index([chunk_id]) - self.save_local() + self.save_local(self.path) return True - - def _load_from_remote_storage(self): - with tempfile.TemporaryDirectory() as temp_dir: - try: - # Check if both index files exist in remote storage - faiss_path = f"{self.storage_path}/index.faiss" - pkl_path = f"{self.storage_path}/index.pkl" - - if not self.storage.file_exists(faiss_path) or not self.storage.file_exists(pkl_path): - raise FileNotFoundError(f"FAISS index files not found at {self.storage_path}") - - # Download both files to temp directory - faiss_file = self.storage.get_file(faiss_path) - pkl_file = self.storage.get_file(pkl_path) - - local_faiss_path = os.path.join(temp_dir, "index.faiss") - local_pkl_path = os.path.join(temp_dir, "index.pkl") - - with open(local_faiss_path, 'wb') as f: - f.write(faiss_file.read()) - - with open(local_pkl_path, 'wb') as f: - f.write(pkl_file.read()) - - # Load the index from the temp directory - return FAISS.load_local( - temp_dir, self.embeddings, allow_dangerous_deserialization=True - ) - except Exception as e: - logging.error(f"Error loading FAISS index from remote storage: {e}") - raise - - def _upload_index_to_remote(self, local_folder): - try: - # Get paths to the index files - local_faiss_path = os.path.join(local_folder, "index.faiss") - local_pkl_path = os.path.join(local_folder, "index.pkl") - - remote_faiss_path = f"{self.storage_path}/index.faiss" - remote_pkl_path = f"{self.storage_path}/index.pkl" - - # Upload both files to remote storage - with open(local_faiss_path, 'rb') as f: - self.storage.save_file(f, remote_faiss_path) - - with open(local_pkl_path, 'rb') as f: - self.storage.save_file(f, remote_pkl_path) - - logging.info(f"Successfully uploaded FAISS index to {self.storage_path}") - except Exception as e: - logging.error(f"Error uploading FAISS index to remote storage: {e}") - raise diff --git a/application/worker.py b/application/worker.py index 5d32cf66..f8076260 100755 --- a/application/worker.py +++ b/application/worker.py @@ -89,12 +89,9 @@ def download_file(url, params, dest_path): def upload_index(full_path, file_data): try: if settings.VECTOR_STORE == "faiss": - faiss_path = os.path.join(full_path, "index.faiss") - pkl_path = os.path.join(full_path, "index.pkl") - files = { - "file_faiss": open(faiss_path, "rb"), - "file_pkl": open(pkl_path, "rb"), + "file_faiss": open(full_path + "/index.faiss", "rb"), + "file_pkl": open(full_path + "/index.pkl", "rb"), } response = requests.post( urljoin(settings.API_URL, "/api/upload_index"), files=files, data=file_data @@ -107,11 +104,8 @@ def upload_index(full_path, file_data): except requests.RequestException as e: logging.error(f"Error uploading index: {e}") raise - except FileNotFoundError as e: - logging.error(f"File not found: {e}") - raise finally: - if settings.VECTOR_STORE == "faiss" and 'files' in locals(): + if settings.VECTOR_STORE == "faiss": for file in files.values(): file.close() @@ -143,23 +137,23 @@ def ingest_worker( storage = StorageCreator.create_storage(settings.STORAGE_TYPE) temp_dir = tempfile.mkdtemp() full_path = os.path.join(temp_dir, name_job) - + if not os.path.exists(full_path): os.makedirs(full_path) logging.info(f"Ingest file: {directory}/{user}/{name_job}/{filename}", extra={"user": user, "job": name_job}) file_data = {"name": name_job, "file": filename, "user": user} - + try: file_path = f"{directory}/{user}/{name_job}/{filename}" - + try: file_obj = storage.get_file(file_path) - + local_file_path = os.path.join(full_path, filename) with open(local_file_path, 'wb') as f: shutil.copyfileobj(file_obj, f) - + # check if file is .zip and extract it if filename.endswith(".zip"): extract_zip_recursive( @@ -194,7 +188,7 @@ def ingest_worker( vector_dir = os.path.join(temp_dir, "vector_store") os.makedirs(vector_dir, exist_ok=True) - + embed_and_store_documents(docs, vector_dir, str(id), self) tokens = count_tokens_docs(docs) self.update_state(state="PROGRESS", meta={"current": 100}) @@ -209,11 +203,11 @@ def ingest_worker( "id": str(id), "type": "local", }) - + mongo = MongoDB.get_client() db = mongo["docsgpt"] sources_collection = db["sources"] - + sources_collection.insert_one({ "_id": id, "name": name_job, @@ -234,7 +228,7 @@ def ingest_worker( "user": user, "limited": False, } - + finally: if os.path.exists(temp_dir): shutil.rmtree(temp_dir) From 637d3a24a1d7adb6ed78e6b3be22855d195fc4af Mon Sep 17 00:00:00 2001 From: ManishMadan2882 Date: Wed, 23 Apr 2025 00:52:55 +0530 Subject: [PATCH 19/39] Revert "(feat:storage) file, indexes uploads" This reverts commit 64c42f0ddf75c0e0b9cca77050278b02672927c5. --- application/api/internal/routes.py | 20 ++-- application/api/user/routes.py | 84 +++++++++-------- application/worker.py | 144 +++++++++++------------------ 3 files changed, 106 insertions(+), 142 deletions(-) diff --git a/application/api/internal/routes.py b/application/api/internal/routes.py index e95b6327..c8e32d11 100755 --- a/application/api/internal/routes.py +++ b/application/api/internal/routes.py @@ -6,7 +6,7 @@ from bson.objectid import ObjectId from application.core.mongo_db import MongoDB from application.core.settings import settings -from application.storage.storage_creator import StorageCreator + mongo = MongoDB.get_client() db = mongo["docsgpt"] conversations_collection = db["conversations"] @@ -45,8 +45,7 @@ def upload_index_files(): remote_data = request.form["remote_data"] if "remote_data" in request.form else None sync_frequency = secure_filename(request.form["sync_frequency"]) if "sync_frequency" in request.form else None - storage = StorageCreator.create_storage(settings.STORAGE_TYPE) - + save_dir = os.path.join(current_dir, "indexes", str(id)) if settings.VECTOR_STORE == "faiss": if "file_faiss" not in request.files: print("No file part") @@ -60,13 +59,12 @@ def upload_index_files(): file_pkl = request.files["file_pkl"] if file_pkl.filename == "": return {"status": "no file name"} - - # Save index files - storage_path_faiss = f"indexes/{str(id)}/index.faiss" - storage_path_pkl = f"indexes/{str(id)}/index.pkl" - - storage.save_file(file_faiss, storage_path_faiss) - storage.save_file(file_pkl, storage_path_pkl) + # saves index files + + if not os.path.exists(save_dir): + os.makedirs(save_dir) + file_faiss.save(os.path.join(save_dir, "index.faiss")) + file_pkl.save(os.path.join(save_dir, "index.pkl")) existing_entry = sources_collection.find_one({"_id": ObjectId(id)}) if existing_entry: @@ -84,7 +82,6 @@ def upload_index_files(): "retriever": retriever, "remote_data": remote_data, "sync_frequency": sync_frequency, - "storage_type": settings.STORAGE_TYPE, } }, ) @@ -102,7 +99,6 @@ def upload_index_files(): "retriever": retriever, "remote_data": remote_data, "sync_frequency": sync_frequency, - "storage_type": settings.STORAGE_TYPE, } ) return {"status": "ok"} diff --git a/application/api/user/routes.py b/application/api/user/routes.py index b7d79128..9e97e2ab 100644 --- a/application/api/user/routes.py +++ b/application/api/user/routes.py @@ -4,7 +4,6 @@ import math import os import shutil import uuid -import tempfile from bson.binary import Binary, UuidRepresentation from bson.dbref import DBRef @@ -22,7 +21,6 @@ from application.extensions import api from application.tts.google_tts import GoogleTTS from application.utils import check_required_fields, validate_function_name from application.vectorstore.vector_creator import VectorCreator -from application.storage.storage_creator import StorageCreator mongo = MongoDB.get_client() db = mongo["docsgpt"] @@ -415,50 +413,54 @@ class UploadFile(Resource): user = secure_filename(decoded_token.get("sub")) job_name = secure_filename(request.form["name"]) - storage = StorageCreator.get_storage() - try: + save_dir = os.path.join(current_dir, settings.UPLOAD_FOLDER, user, job_name) + os.makedirs(save_dir, exist_ok=True) + if len(files) > 1: - temp_dir = tempfile.mkdtemp() - try: - for file in files: - filename = secure_filename(file.filename) - file.save(os.path.join(temp_dir, filename)) - - zip_path = os.path.join(temp_dir, f"{job_name}.zip") - shutil.make_archive( - base_name=os.path.join(temp_dir, job_name), - format="zip", - root_dir=temp_dir, - base_dir="." - ) - - final_filename = f"{job_name}.zip" - relative_path = f"{settings.UPLOAD_FOLDER}/{user}/{job_name}/{final_filename}" - - with open(zip_path, 'rb') as zip_file: - storage.save_file(zip_file, relative_path) - - task = ingest.delay( - relative_path, - [ - ".rst", ".md", ".pdf", ".txt", ".docx", ".csv", - ".epub", ".html", ".mdx", ".json", ".xlsx", - ".pptx", ".png", ".jpg", ".jpeg", - ], - job_name, - final_filename, - user, - ) - finally: - shutil.rmtree(temp_dir) + temp_dir = os.path.join(save_dir, "temp") + os.makedirs(temp_dir, exist_ok=True) + + for file in files: + filename = secure_filename(file.filename) + file.save(os.path.join(temp_dir, filename)) + print(f"Saved file: {filename}") + zip_path = shutil.make_archive( + base_name=os.path.join(save_dir, job_name), + format="zip", + root_dir=temp_dir, + ) + final_filename = os.path.basename(zip_path) + shutil.rmtree(temp_dir) + task = ingest.delay( + settings.UPLOAD_FOLDER, + [ + ".rst", + ".md", + ".pdf", + ".txt", + ".docx", + ".csv", + ".epub", + ".html", + ".mdx", + ".json", + ".xlsx", + ".pptx", + ".png", + ".jpg", + ".jpeg", + ], + job_name, + final_filename, + user, + ) else: file = files[0] final_filename = secure_filename(file.filename) - relative_path = f"{settings.UPLOAD_FOLDER}/{user}/{job_name}/{final_filename}" - - storage.save_file(file, relative_path) - + file_path = os.path.join(save_dir, final_filename) + file.save(file_path) + task = ingest.delay( settings.UPLOAD_FOLDER, [ diff --git a/application/worker.py b/application/worker.py index f8076260..b5caa23e 100755 --- a/application/worker.py +++ b/application/worker.py @@ -133,105 +133,71 @@ def ingest_worker( limit = None exclude = True sample = False + full_path = os.path.join(directory, user, name_job) + + logging.info(f"Ingest file: {full_path}", extra={"user": user, "job": name_job}) + file_data = {"name": name_job, "file": filename, "user": user} - storage = StorageCreator.create_storage(settings.STORAGE_TYPE) - temp_dir = tempfile.mkdtemp() - full_path = os.path.join(temp_dir, name_job) - if not os.path.exists(full_path): os.makedirs(full_path) + download_file(urljoin(settings.API_URL, "/api/download"), file_data, os.path.join(full_path, filename)) - logging.info(f"Ingest file: {directory}/{user}/{name_job}/{filename}", extra={"user": user, "job": name_job}) - file_data = {"name": name_job, "file": filename, "user": user} - - try: - file_path = f"{directory}/{user}/{name_job}/{filename}" - - try: - file_obj = storage.get_file(file_path) - - local_file_path = os.path.join(full_path, filename) - with open(local_file_path, 'wb') as f: - shutil.copyfileobj(file_obj, f) - - # check if file is .zip and extract it - if filename.endswith(".zip"): - extract_zip_recursive( - os.path.join(full_path, filename), full_path, 0, RECURSION_DEPTH - ) - except FileNotFoundError as e: - logging.error(f"File not found in storage: {file_path}") - raise FileNotFoundError(f"File not found: {file_path}") from e - - self.update_state(state="PROGRESS", meta={"current": 1}) - - raw_docs = SimpleDirectoryReader( - input_dir=full_path, - input_files=input_files, - recursive=recursive, - required_exts=formats, - num_files_limit=limit, - exclude_hidden=exclude, - file_metadata=metadata_from_filename, - ).load_data() - - chunker = Chunker( - chunking_strategy="classic_chunk", - max_tokens=MAX_TOKENS, - min_tokens=MIN_TOKENS, - duplicate_headers=False + # check if file is .zip and extract it + if filename.endswith(".zip"): + extract_zip_recursive( + os.path.join(full_path, filename), full_path, 0, RECURSION_DEPTH ) - raw_docs = chunker.chunk(documents=raw_docs) - docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs] - id = ObjectId() + self.update_state(state="PROGRESS", meta={"current": 1}) - vector_dir = os.path.join(temp_dir, "vector_store") - os.makedirs(vector_dir, exist_ok=True) - - embed_and_store_documents(docs, vector_dir, str(id), self) - tokens = count_tokens_docs(docs) - self.update_state(state="PROGRESS", meta={"current": 100}) + raw_docs = SimpleDirectoryReader( + input_dir=full_path, + input_files=input_files, + recursive=recursive, + required_exts=formats, + num_files_limit=limit, + exclude_hidden=exclude, + file_metadata=metadata_from_filename, + ).load_data() - if sample: - for i in range(min(5, len(raw_docs))): - logging.info(f"Sample document {i}: {raw_docs[i]}") + chunker = Chunker( + chunking_strategy="classic_chunk", + max_tokens=MAX_TOKENS, + min_tokens=MIN_TOKENS, + duplicate_headers=False + ) + raw_docs = chunker.chunk(documents=raw_docs) - file_data.update({ - "tokens": tokens, - "retriever": retriever, - "id": str(id), - "type": "local", - }) - - mongo = MongoDB.get_client() - db = mongo["docsgpt"] - sources_collection = db["sources"] - - sources_collection.insert_one({ - "_id": id, - "name": name_job, - "user": user, - "date": datetime.datetime.now(), - "tokens": tokens, - "retriever": retriever, - "type": "local", - "storage_type": settings.STORAGE_TYPE, - "original_file_path": file_path - }) + docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs] + id = ObjectId() - return { - "directory": directory, - "formats": formats, - "name_job": name_job, - "filename": filename, - "user": user, - "limited": False, - } - - finally: - if os.path.exists(temp_dir): - shutil.rmtree(temp_dir) + embed_and_store_documents(docs, full_path, id, self) + tokens = count_tokens_docs(docs) + self.update_state(state="PROGRESS", meta={"current": 100}) + + if sample: + for i in range(min(5, len(raw_docs))): + logging.info(f"Sample document {i}: {raw_docs[i]}") + + file_data.update({ + "tokens": tokens, + "retriever": retriever, + "id": str(id), + "type": "local", + }) + upload_index(full_path, file_data) + + # delete local + shutil.rmtree(full_path) + + return { + "directory": directory, + "formats": formats, + "name_job": name_job, + "filename": filename, + "user": user, + "limited": False, + } def remote_worker( self, From e60f78ac4afb8d10992fa5cd8e787649d3e00be6 Mon Sep 17 00:00:00 2001 From: ManishMadan2882 Date: Wed, 23 Apr 2025 03:39:35 +0530 Subject: [PATCH 20/39] (feat:storage) file uploads --- application/api/user/routes.py | 104 ++++++++++++++-------------- application/worker.py | 121 ++++++++++++++++++++------------- 2 files changed, 129 insertions(+), 96 deletions(-) diff --git a/application/api/user/routes.py b/application/api/user/routes.py index 9e97e2ab..6b52a436 100644 --- a/application/api/user/routes.py +++ b/application/api/user/routes.py @@ -413,81 +413,85 @@ class UploadFile(Resource): user = secure_filename(decoded_token.get("sub")) job_name = secure_filename(request.form["name"]) + try: - save_dir = os.path.join(current_dir, settings.UPLOAD_FOLDER, user, job_name) - os.makedirs(save_dir, exist_ok=True) - + from application.storage.storage_creator import StorageCreator + storage = StorageCreator.get_storage() + + base_path = f"{settings.UPLOAD_FOLDER}/{user}/{job_name}" + if len(files) > 1: - temp_dir = os.path.join(save_dir, "temp") - os.makedirs(temp_dir, exist_ok=True) - + temp_files = [] for file in files: filename = secure_filename(file.filename) - file.save(os.path.join(temp_dir, filename)) + temp_path = f"{base_path}/temp/{filename}" + storage.save_file(file, temp_path) + temp_files.append(temp_path) print(f"Saved file: {filename}") - zip_path = shutil.make_archive( - base_name=os.path.join(save_dir, job_name), - format="zip", - root_dir=temp_dir, - ) - final_filename = os.path.basename(zip_path) - shutil.rmtree(temp_dir) + + zip_filename = f"{job_name}.zip" + zip_path = f"{base_path}/{zip_filename}" + + def create_zip_archive(temp_paths, **kwargs): + import tempfile + with tempfile.TemporaryDirectory() as temp_dir: + for path in temp_paths: + file_data = storage.get_file(path) + with open(os.path.join(temp_dir, os.path.basename(path)), 'wb') as f: + f.write(file_data.read()) + + # Create zip archive + zip_temp = shutil.make_archive( + base_name=os.path.join(temp_dir, job_name), + format="zip", + root_dir=temp_dir + ) + + return zip_temp + + zip_temp_path = create_zip_archive(temp_files) + with open(zip_temp_path, 'rb') as zip_file: + storage.save_file(zip_file, zip_path) + + # Clean up temp files + for temp_path in temp_files: + storage.delete_file(temp_path) + task = ingest.delay( settings.UPLOAD_FOLDER, [ - ".rst", - ".md", - ".pdf", - ".txt", - ".docx", - ".csv", - ".epub", - ".html", - ".mdx", - ".json", - ".xlsx", - ".pptx", - ".png", - ".jpg", - ".jpeg", + ".rst", ".md", ".pdf", ".txt", ".docx", ".csv", ".epub", + ".html", ".mdx", ".json", ".xlsx", ".pptx", ".png", + ".jpg", ".jpeg", ], job_name, - final_filename, + zip_filename, user, ) else: + # For single file file = files[0] - final_filename = secure_filename(file.filename) - file_path = os.path.join(save_dir, final_filename) - file.save(file_path) - + filename = secure_filename(file.filename) + file_path = f"{base_path}/{filename}" + + storage.save_file(file, file_path) + task = ingest.delay( settings.UPLOAD_FOLDER, [ - ".rst", - ".md", - ".pdf", - ".txt", - ".docx", - ".csv", - ".epub", - ".html", - ".mdx", - ".json", - ".xlsx", - ".pptx", - ".png", - ".jpg", - ".jpeg", + ".rst", ".md", ".pdf", ".txt", ".docx", ".csv", ".epub", + ".html", ".mdx", ".json", ".xlsx", ".pptx", ".png", + ".jpg", ".jpeg", ], job_name, - final_filename, + filename, user, ) except Exception as err: current_app.logger.error(f"Error uploading file: {err}") return make_response(jsonify({"success": False}), 400) + return make_response(jsonify({"success": True, "task_id": task.id}), 200) diff --git a/application/worker.py b/application/worker.py index b5caa23e..d83639d7 100755 --- a/application/worker.py +++ b/application/worker.py @@ -133,62 +133,91 @@ def ingest_worker( limit = None exclude = True sample = False + + storage = StorageCreator.get_storage() + full_path = os.path.join(directory, user, name_job) - + source_file_path = os.path.join(full_path, filename) + logging.info(f"Ingest file: {full_path}", extra={"user": user, "job": name_job}) - file_data = {"name": name_job, "file": filename, "user": user} + + # Create temporary working directory + with tempfile.TemporaryDirectory() as temp_dir: + try: + os.makedirs(temp_dir, exist_ok=True) + + # Download file from storage to temp directory + temp_file_path = os.path.join(temp_dir, filename) + file_data = storage.get_file(source_file_path) + + with open(temp_file_path, 'wb') as f: + f.write(file_data.read()) + + self.update_state(state="PROGRESS", meta={"current": 1}) - if not os.path.exists(full_path): - os.makedirs(full_path) - download_file(urljoin(settings.API_URL, "/api/download"), file_data, os.path.join(full_path, filename)) + # Handle zip files + if filename.endswith('.zip'): + logging.info(f"Extracting zip file: {filename}") + extract_zip_recursive( + temp_file_path, + temp_dir, + current_depth=0, + max_depth=RECURSION_DEPTH + ) - # check if file is .zip and extract it - if filename.endswith(".zip"): - extract_zip_recursive( - os.path.join(full_path, filename), full_path, 0, RECURSION_DEPTH - ) + if sample: + logging.info(f"Sample mode enabled. Using {limit} documents.") - self.update_state(state="PROGRESS", meta={"current": 1}) + reader = SimpleDirectoryReader( + input_dir=temp_dir, + input_files=input_files, + recursive=recursive, + required_exts=formats, + exclude_hidden=exclude, + file_metadata=metadata_from_filename, + ) + raw_docs = reader.load_data() - raw_docs = SimpleDirectoryReader( - input_dir=full_path, - input_files=input_files, - recursive=recursive, - required_exts=formats, - num_files_limit=limit, - exclude_hidden=exclude, - file_metadata=metadata_from_filename, - ).load_data() + chunker = Chunker( + chunking_strategy="classic_chunk", + max_tokens=MAX_TOKENS, + min_tokens=MIN_TOKENS, + duplicate_headers=False + ) + raw_docs = chunker.chunk(documents=raw_docs) + + docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs] + + id = ObjectId() + + vector_store_path = os.path.join(temp_dir, 'vector_store') + os.makedirs(vector_store_path, exist_ok=True) + + embed_and_store_documents(docs, vector_store_path, id, self) + + tokens = count_tokens_docs(docs) + + self.update_state(state="PROGRESS", meta={"current": 100}) - chunker = Chunker( - chunking_strategy="classic_chunk", - max_tokens=MAX_TOKENS, - min_tokens=MIN_TOKENS, - duplicate_headers=False - ) - raw_docs = chunker.chunk(documents=raw_docs) + if sample: + for i in range(min(5, len(raw_docs))): + logging.info(f"Sample document {i}: {raw_docs[i]}") + file_data = { + "name": name_job, + "file": filename, + "user": user, + "tokens": tokens, + "retriever": retriever, + "id": str(id), + "type": "local", + } - docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs] - id = ObjectId() - embed_and_store_documents(docs, full_path, id, self) - tokens = count_tokens_docs(docs) - self.update_state(state="PROGRESS", meta={"current": 100}) + upload_index(vector_store_path, file_data) - if sample: - for i in range(min(5, len(raw_docs))): - logging.info(f"Sample document {i}: {raw_docs[i]}") - - file_data.update({ - "tokens": tokens, - "retriever": retriever, - "id": str(id), - "type": "local", - }) - upload_index(full_path, file_data) - - # delete local - shutil.rmtree(full_path) + except Exception as e: + logging.error(f"Error in ingest_worker: {e}", exc_info=True) + raise return { "directory": directory, From 0ce27f274ab806c457ee75a666f9e7a776c37854 Mon Sep 17 00:00:00 2001 From: ManishMadan2882 Date: Wed, 23 Apr 2025 04:28:45 +0530 Subject: [PATCH 21/39] (feat:storage) file indexes/faiss --- application/api/internal/routes.py | 23 ++++++++++--------- application/vectorstore/faiss.py | 36 ++++++++++++++++++++++++------ 2 files changed, 42 insertions(+), 17 deletions(-) diff --git a/application/api/internal/routes.py b/application/api/internal/routes.py index c8e32d11..f0ad042f 100755 --- a/application/api/internal/routes.py +++ b/application/api/internal/routes.py @@ -3,10 +3,13 @@ import datetime from flask import Blueprint, request, send_from_directory from werkzeug.utils import secure_filename from bson.objectid import ObjectId - +import logging from application.core.mongo_db import MongoDB from application.core.settings import settings +from application.storage.storage_creator import StorageCreator + +logger = logging.getLogger(__name__) mongo = MongoDB.get_client() db = mongo["docsgpt"] conversations_collection = db["conversations"] @@ -45,26 +48,26 @@ def upload_index_files(): remote_data = request.form["remote_data"] if "remote_data" in request.form else None sync_frequency = secure_filename(request.form["sync_frequency"]) if "sync_frequency" in request.form else None - save_dir = os.path.join(current_dir, "indexes", str(id)) + storage = StorageCreator.get_storage() + index_base_path = f"indexes/{id}" + if settings.VECTOR_STORE == "faiss": if "file_faiss" not in request.files: - print("No file part") + logger.error("No file_faiss part") return {"status": "no file"} file_faiss = request.files["file_faiss"] if file_faiss.filename == "": return {"status": "no file name"} if "file_pkl" not in request.files: - print("No file part") + logger.error("No file_pkl part") return {"status": "no file"} file_pkl = request.files["file_pkl"] if file_pkl.filename == "": return {"status": "no file name"} - # saves index files - - if not os.path.exists(save_dir): - os.makedirs(save_dir) - file_faiss.save(os.path.join(save_dir, "index.faiss")) - file_pkl.save(os.path.join(save_dir, "index.pkl")) + + # Save index files to storage + storage.save_file(file_faiss, f"{index_base_path}/index.faiss") + storage.save_file(file_pkl, f"{index_base_path}/index.pkl") existing_entry = sources_collection.find_one({"_id": ObjectId(id)}) if existing_entry: diff --git a/application/vectorstore/faiss.py b/application/vectorstore/faiss.py index 87ffcccb..ce455bd8 100644 --- a/application/vectorstore/faiss.py +++ b/application/vectorstore/faiss.py @@ -1,17 +1,19 @@ import os +import tempfile from langchain_community.vectorstores import FAISS from application.core.settings import settings from application.parser.schema.base import Document from application.vectorstore.base import BaseVectorStore +from application.storage.storage_creator import StorageCreator def get_vectorstore(path: str) -> str: if path: - vectorstore = os.path.join("application", "indexes", path) + vectorstore = f"indexes/{path}" else: - vectorstore = os.path.join("application") + vectorstore = "indexes" return vectorstore @@ -21,16 +23,36 @@ class FaissStore(BaseVectorStore): self.source_id = source_id self.path = get_vectorstore(source_id) self.embeddings = self._get_embeddings(settings.EMBEDDINGS_NAME, embeddings_key) + self.storage = StorageCreator.get_storage() try: if docs_init: self.docsearch = FAISS.from_documents(docs_init, self.embeddings) else: - self.docsearch = FAISS.load_local( - self.path, self.embeddings, allow_dangerous_deserialization=True - ) - except Exception: - raise + with tempfile.TemporaryDirectory() as temp_dir: + faiss_path = f"{self.path}/index.faiss" + pkl_path = f"{self.path}/index.pkl" + + if not self.storage.file_exists(faiss_path) or not self.storage.file_exists(pkl_path): + raise FileNotFoundError(f"Index files not found in storage at {self.path}") + + faiss_file = self.storage.get_file(faiss_path) + pkl_file = self.storage.get_file(pkl_path) + + local_faiss_path = os.path.join(temp_dir, "index.faiss") + local_pkl_path = os.path.join(temp_dir, "index.pkl") + + with open(local_faiss_path, 'wb') as f: + f.write(faiss_file.read()) + + with open(local_pkl_path, 'wb') as f: + f.write(pkl_file.read()) + + self.docsearch = FAISS.load_local( + temp_dir, self.embeddings, allow_dangerous_deserialization=True + ) + except Exception as e: + raise Exception(f"Error loading FAISS index: {str(e)}") self.assert_embedding_dimensions(self.embeddings) From 3cd9a72495533cc738e400ba9dc74db94f304cc9 Mon Sep 17 00:00:00 2001 From: Alex Date: Wed, 23 Apr 2025 23:13:39 +0100 Subject: [PATCH 22/39] add storage type to the settings cofig --- application/core/settings.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/application/core/settings.py b/application/core/settings.py index 74bffe53..c3c5159e 100644 --- a/application/core/settings.py +++ b/application/core/settings.py @@ -98,6 +98,8 @@ class Settings(BaseSettings): BRAVE_SEARCH_API_KEY: Optional[str] = None FLASK_DEBUG_MODE: bool = False + STORAGE_TYPE: str = "local" # local or s3 + JWT_SECRET_KEY: str = "" From 76fd6e15cc2e84071e68d893da1304cf38a3cef4 Mon Sep 17 00:00:00 2001 From: Alex Date: Thu, 24 Apr 2025 18:54:58 +0300 Subject: [PATCH 23/39] Update Dockerfile --- application/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/application/Dockerfile b/application/Dockerfile index 308b721b..e33721a2 100644 --- a/application/Dockerfile +++ b/application/Dockerfile @@ -84,4 +84,4 @@ EXPOSE 7091 USER appuser # Start Gunicorn -CMD ["gunicorn", "-w", "2", "--timeout", "120", "--bind", "0.0.0.0:7091", "application.wsgi:app"] \ No newline at end of file +CMD ["gunicorn", "-w", "1", "--timeout", "120", "--bind", "0.0.0.0:7091", "--preload", "application.wsgi:app"] From df9d432d29c1bbdf28abb3d35d129060b1964dd3 Mon Sep 17 00:00:00 2001 From: Alex Date: Thu, 24 Apr 2025 17:29:41 +0100 Subject: [PATCH 24/39] fix: mongo db database name in settings --- application/agents/base.py | 5 +++-- application/api/answer/routes.py | 2 +- application/api/internal/routes.py | 2 +- application/api/user/routes.py | 2 +- application/core/settings.py | 1 + application/llm/google_ai.py | 3 ++- application/llm/openai.py | 2 +- application/logging.py | 3 ++- application/usage.py | 3 ++- application/worker.py | 4 ++-- 10 files changed, 16 insertions(+), 11 deletions(-) diff --git a/application/agents/base.py b/application/agents/base.py index 64fac17b..e4b76ca1 100644 --- a/application/agents/base.py +++ b/application/agents/base.py @@ -10,6 +10,7 @@ from application.core.mongo_db import MongoDB from application.llm.llm_creator import LLMCreator from application.logging import build_stack_data, log_activity, LogContext from application.retriever.base import BaseRetriever +from application.core.settings import settings from bson.objectid import ObjectId @@ -61,7 +62,7 @@ class BaseAgent(ABC): def _get_tools(self, api_key: str = None) -> Dict[str, Dict]: mongo = MongoDB.get_client() - db = mongo["docsgpt"] + db = mongo[settings.MONGO_DB_NAME] agents_collection = db["agents"] tools_collection = db["user_tools"] @@ -82,7 +83,7 @@ class BaseAgent(ABC): def _get_user_tools(self, user="local"): mongo = MongoDB.get_client() - db = mongo["docsgpt"] + db = mongo[settings.MONGO_DB_NAME] user_tools_collection = db["user_tools"] user_tools = user_tools_collection.find({"user": user, "status": True}) user_tools = list(user_tools) diff --git a/application/api/answer/routes.py b/application/api/answer/routes.py index 8f44385b..2a8476d8 100644 --- a/application/api/answer/routes.py +++ b/application/api/answer/routes.py @@ -23,7 +23,7 @@ from application.utils import check_required_fields, limit_chat_history logger = logging.getLogger(__name__) mongo = MongoDB.get_client() -db = mongo["docsgpt"] +db = mongo[settings.MONGO_DB_NAME] conversations_collection = db["conversations"] sources_collection = db["sources"] prompts_collection = db["prompts"] diff --git a/application/api/internal/routes.py b/application/api/internal/routes.py index f0ad042f..80759593 100755 --- a/application/api/internal/routes.py +++ b/application/api/internal/routes.py @@ -11,7 +11,7 @@ from application.storage.storage_creator import StorageCreator logger = logging.getLogger(__name__) mongo = MongoDB.get_client() -db = mongo["docsgpt"] +db = mongo[settings.MONGO_DB_NAME] conversations_collection = db["conversations"] sources_collection = db["sources"] diff --git a/application/api/user/routes.py b/application/api/user/routes.py index 6b52a436..d9c41c8f 100644 --- a/application/api/user/routes.py +++ b/application/api/user/routes.py @@ -23,7 +23,7 @@ from application.utils import check_required_fields, validate_function_name from application.vectorstore.vector_creator import VectorCreator mongo = MongoDB.get_client() -db = mongo["docsgpt"] +db = mongo[settings.MONGO_DB_NAME] conversations_collection = db["conversations"] sources_collection = db["sources"] prompts_collection = db["prompts"] diff --git a/application/core/settings.py b/application/core/settings.py index c3c5159e..3be34242 100644 --- a/application/core/settings.py +++ b/application/core/settings.py @@ -19,6 +19,7 @@ class Settings(BaseSettings): CELERY_BROKER_URL: str = "redis://localhost:6379/0" CELERY_RESULT_BACKEND: str = "redis://localhost:6379/1" MONGO_URI: str = "mongodb://localhost:27017/docsgpt" + MONGO_DB_NAME: str = "docsgpt" MODEL_PATH: str = os.path.join(current_dir, "models/docsgpt-7b-f16.gguf") DEFAULT_MAX_HISTORY: int = 150 MODEL_TOKEN_LIMITS: dict = { diff --git a/application/llm/google_ai.py b/application/llm/google_ai.py index 06dbbdfd..a56616d2 100644 --- a/application/llm/google_ai.py +++ b/application/llm/google_ai.py @@ -5,6 +5,7 @@ import json from application.llm.base import BaseLLM from application.storage.storage_creator import StorageCreator +from application.core.settings import settings class GoogleLLM(BaseLLM): @@ -120,7 +121,7 @@ class GoogleLLM(BaseLLM): from application.core.mongo_db import MongoDB mongo = MongoDB.get_client() - db = mongo["docsgpt"] + db = mongo[settings.MONGO_DB_NAME] attachments_collection = db["attachments"] if '_id' in attachment: attachments_collection.update_one( diff --git a/application/llm/openai.py b/application/llm/openai.py index e8df92dd..248fd7e2 100644 --- a/application/llm/openai.py +++ b/application/llm/openai.py @@ -291,7 +291,7 @@ class OpenAILLM(BaseLLM): from application.core.mongo_db import MongoDB mongo = MongoDB.get_client() - db = mongo["docsgpt"] + db = mongo[settings.MONGO_DB_NAME] attachments_collection = db["attachments"] if '_id' in attachment: attachments_collection.update_one( diff --git a/application/logging.py b/application/logging.py index 1dd0d557..ed07f858 100644 --- a/application/logging.py +++ b/application/logging.py @@ -7,6 +7,7 @@ import uuid from typing import Any, Callable, Dict, Generator, List from application.core.mongo_db import MongoDB +from application.core.settings import settings logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" @@ -131,7 +132,7 @@ def _log_to_mongodb( ) -> None: try: mongo = MongoDB.get_client() - db = mongo["docsgpt"] + db = mongo[settings.MONGO_DB_NAME] user_logs_collection = db["stack_logs"] log_entry = { diff --git a/application/usage.py b/application/usage.py index 85328c1f..46620fff 100644 --- a/application/usage.py +++ b/application/usage.py @@ -2,10 +2,11 @@ import sys from datetime import datetime from application.core.mongo_db import MongoDB +from application.core.settings import settings from application.utils import num_tokens_from_object_or_list, num_tokens_from_string mongo = MongoDB.get_client() -db = mongo["docsgpt"] +db = mongo[settings.MONGO_DB_NAME] usage_collection = db["token_usage"] diff --git a/application/worker.py b/application/worker.py index d83639d7..3f542b6a 100755 --- a/application/worker.py +++ b/application/worker.py @@ -26,7 +26,7 @@ from application.parser.chunking import Chunker from application.utils import count_tokens_docs mongo = MongoDB.get_client() -db = mongo["docsgpt"] +db = mongo[settings.MONGO_DB_NAME] sources_collection = db["sources"] # Constants @@ -356,7 +356,7 @@ def attachment_worker(self, file_info, user): """ mongo = MongoDB.get_client() - db = mongo["docsgpt"] + db = mongo[settings.MONGO_DB_NAME] attachments_collection = db["attachments"] filename = file_info["filename"] From 8289b02ab0d533d45bf9dcf6a1af2dea4b003984 Mon Sep 17 00:00:00 2001 From: Siddhant Rai Date: Sat, 26 Apr 2025 12:00:29 +0530 Subject: [PATCH 25/39] feat: add agent webhook endpoint and implement related functionality --- application/api/user/routes.py | 90 +++++++- application/api/user/tasks.py | 14 +- application/worker.py | 247 +++++++++++++++++----- frontend/src/Navigation.tsx | 47 ++-- frontend/src/agents/AgentPreview.tsx | 1 + frontend/src/agents/NewAgent.tsx | 11 +- frontend/src/agents/index.tsx | 29 ++- frontend/src/api/endpoints.ts | 1 + frontend/src/api/services/userService.ts | 2 + frontend/src/components/MessageInput.tsx | 13 +- frontend/src/modals/AgentDetailsModal.tsx | 59 +++++- frontend/src/modals/ConfirmationModal.tsx | 10 +- 12 files changed, 424 insertions(+), 100 deletions(-) diff --git a/application/api/user/routes.py b/application/api/user/routes.py index 8876be6b..391444fc 100644 --- a/application/api/user/routes.py +++ b/application/api/user/routes.py @@ -4,6 +4,7 @@ import math import os import shutil import uuid +import secrets from bson.binary import Binary, UuidRepresentation from bson.dbref import DBRef @@ -14,7 +15,12 @@ from werkzeug.utils import secure_filename from application.agents.tools.tool_manager import ToolManager -from application.api.user.tasks import ingest, ingest_remote, store_attachment +from application.api.user.tasks import ( + ingest, + ingest_remote, + store_attachment, + process_agent_webhook, +) from application.core.mongo_db import MongoDB from application.core.settings import settings from application.extensions import api @@ -1329,6 +1335,88 @@ class DeleteAgent(Resource): return make_response(jsonify({"id": deleted_id}), 200) +@user_ns.route("/api/agent_webhook") +class AgentWebhook(Resource): + @api.doc( + params={"id": "ID of the agent"}, + description="Generate webhook URL for the agent", + ) + def get(self): + decoded_token = request.decoded_token + if not decoded_token: + return make_response(jsonify({"success": False}), 401) + user = decoded_token.get("sub") + agent_id = request.args.get("id") + if not agent_id: + return make_response( + jsonify({"success": False, "message": "ID is required"}), 400 + ) + + try: + agent = agents_collection.find_one( + {"_id": ObjectId(agent_id), "user": user} + ) + if not agent: + return make_response( + jsonify({"success": False, "message": "Agent not found"}), 404 + ) + + webhook_token = agent.get("incoming_webhook_token") + if not webhook_token: + webhook_token = secrets.token_urlsafe(32) + agents_collection.update_one( + {"_id": ObjectId(agent_id), "user": user}, + {"$set": {"incoming_webhook_token": webhook_token}}, + ) + base_url = settings.API_URL.rstrip("/") + full_webhook_url = f"{base_url}/api/webhooks/agents/{webhook_token}" + + except Exception as err: + current_app.logger.error(f"Error generating webhook URL: {err}") + return make_response( + jsonify({"success": False, "message": "Error generating webhook URL"}), + 400, + ) + return make_response( + jsonify({"success": True, "webhook_url": full_webhook_url}), 200 + ) + + +@user_ns.route(f"/api/webhooks/agents/") +class AgentWebhookListener(Resource): + @api.doc(description="Webhook listener for agent events") + def post(self, webhook_token): + agent = agents_collection.find_one( + {"incoming_webhook_token": webhook_token}, {"_id": 1} + ) + if not agent: + return make_response( + jsonify({"success": False, "message": "Agent not found"}), 404 + ) + data = request.get_json() + if not data: + return make_response( + jsonify({"success": False, "message": "No data provided"}), 400 + ) + + agent_id_str = str(agent["_id"]) + current_app.logger.info( + f"Incoming webhook received for agent {agent_id_str}. Enqueuing task." + ) + + try: + task = process_agent_webhook.delay( + agent_id=agent_id_str, + payload=data, + ) + except Exception as err: + current_app.logger.error(f"Error processing webhook: {err}") + return make_response( + jsonify({"success": False, "message": "Error processing webhook"}), 400 + ) + return make_response(jsonify({"success": True, "task_id": task.id}), 200) + + @user_ns.route("/api/share") class ShareConversation(Resource): share_conversation_model = api.model( diff --git a/application/api/user/tasks.py b/application/api/user/tasks.py index 24cff3c6..f53d856b 100644 --- a/application/api/user/tasks.py +++ b/application/api/user/tasks.py @@ -1,7 +1,13 @@ from datetime import timedelta from application.celery_init import celery -from application.worker import ingest_worker, remote_worker, sync_worker, attachment_worker +from application.worker import ( + agent_webhook_worker, + attachment_worker, + ingest_worker, + remote_worker, + sync_worker, +) @celery.task(bind=True) @@ -28,6 +34,12 @@ def store_attachment(self, directory, saved_files, user): return resp +@celery.task(bind=True) +def process_agent_webhook(self, agent_id, payload): + resp = agent_webhook_worker(self, agent_id, payload) + return resp + + @celery.on_after_configure.connect def setup_periodic_tasks(sender, **kwargs): sender.add_periodic_task( diff --git a/application/worker.py b/application/worker.py index bbd422ac..4782a83b 100755 --- a/application/worker.py +++ b/application/worker.py @@ -1,3 +1,4 @@ +import json import logging import os import shutil @@ -7,15 +8,20 @@ from collections import Counter from urllib.parse import urljoin import requests +from bson.dbref import DBRef from bson.objectid import ObjectId +from application.agents.agent_creator import AgentCreator +from application.api.answer.routes import get_prompt + from application.core.mongo_db import MongoDB from application.core.settings import settings -from application.parser.file.bulk import SimpleDirectoryReader +from application.parser.chunking import Chunker from application.parser.embedding_pipeline import embed_and_store_documents +from application.parser.file.bulk import SimpleDirectoryReader from application.parser.remote.remote_creator import RemoteCreator from application.parser.schema.base import Document -from application.parser.chunking import Chunker +from application.retriever.retriever_creator import RetrieverCreator from application.utils import count_tokens_docs mongo = MongoDB.get_client() @@ -27,18 +33,22 @@ MIN_TOKENS = 150 MAX_TOKENS = 1250 RECURSION_DEPTH = 2 + # Define a function to extract metadata from a given filename. def metadata_from_filename(title): return {"title": title} + # Define a function to generate a random string of a given length. def generate_random_string(length): return "".join([string.ascii_letters[i % 52] for i in range(length)]) + current_dir = os.path.dirname( os.path.dirname(os.path.dirname(os.path.abspath(__file__))) ) + def extract_zip_recursive(zip_path, extract_to, current_depth=0, max_depth=5): """ Recursively extract zip files with a limit on recursion depth. @@ -69,6 +79,7 @@ def extract_zip_recursive(zip_path, extract_to, current_depth=0, max_depth=5): file_path = os.path.join(root, file) extract_zip_recursive(file_path, root, current_depth + 1, max_depth) + def download_file(url, params, dest_path): try: response = requests.get(url, params=params) @@ -79,6 +90,7 @@ def download_file(url, params, dest_path): logging.error(f"Error downloading file: {e}") raise + def upload_index(full_path, file_data): try: if settings.VECTOR_STORE == "faiss": @@ -87,7 +99,9 @@ def upload_index(full_path, file_data): "file_pkl": open(full_path + "/index.pkl", "rb"), } response = requests.post( - urljoin(settings.API_URL, "/api/upload_index"), files=files, data=file_data + urljoin(settings.API_URL, "/api/upload_index"), + files=files, + data=file_data, ) else: response = requests.post( @@ -102,6 +116,75 @@ def upload_index(full_path, file_data): for file in files.values(): file.close() + +def run_agent_logic(agent_config, input_data): + try: + source = agent_config.get("source") + retriever = agent_config.get("retriever", "classic") + if isinstance(source, DBRef): + source_doc = db.dereference(source) + source = str(source_doc["_id"]) + retriever = source_doc.get("retriever", agent_config.get("retriever")) + else: + source = {} + source = {"active_docs": source} + chunks = int(agent_config.get("chunks", 2)) + prompt_id = agent_config.get("prompt_id", "default") + user_api_key = agent_config["key"] + agent_type = agent_config.get("agent_type", "classic") + decoded_token = {"sub": agent_config.get("user")} + prompt = get_prompt(prompt_id) + agent = AgentCreator.create_agent( + agent_type, + endpoint="webhook", + llm_name=settings.LLM_NAME, + gpt_model=settings.MODEL_NAME, + api_key=settings.API_KEY, + user_api_key=user_api_key, + prompt=prompt, + chat_history=[], + decoded_token=decoded_token, + attachments=[], + ) + retriever = RetrieverCreator.create_retriever( + retriever, + source=source, + chat_history=[], + prompt=prompt, + chunks=chunks, + token_limit=settings.DEFAULT_MAX_HISTORY, + gpt_model=settings.MODEL_NAME, + user_api_key=user_api_key, + decoded_token=decoded_token, + ) + answer = agent.gen(query=input_data, retriever=retriever) + response_full = "" + thought = "" + source_log_docs = [] + tool_calls = [] + + for line in answer: + if "answer" in line: + response_full += str(line["answer"]) + elif "sources" in line: + source_log_docs.extend(line["sources"]) + elif "tool_calls" in line: + tool_calls.extend(line["tool_calls"]) + elif "thought" in line: + thought += line["thought"] + + result = { + "answer": response_full, + "sources": source_log_docs, + "tool_calls": tool_calls, + "thought": thought, + } + return result + except Exception as e: + logging.error(f"Error in run_agent_logic: {e}", exc_info=True) + raise + + # Define the main function for ingesting and processing documents. def ingest_worker( self, directory, formats, name_job, filename, user, retriever="classic" @@ -133,7 +216,11 @@ def ingest_worker( if not os.path.exists(full_path): os.makedirs(full_path) - download_file(urljoin(settings.API_URL, "/api/download"), file_data, os.path.join(full_path, filename)) + download_file( + urljoin(settings.API_URL, "/api/download"), + file_data, + os.path.join(full_path, filename), + ) # check if file is .zip and extract it if filename.endswith(".zip"): @@ -157,7 +244,7 @@ def ingest_worker( chunking_strategy="classic_chunk", max_tokens=MAX_TOKENS, min_tokens=MIN_TOKENS, - duplicate_headers=False + duplicate_headers=False, ) raw_docs = chunker.chunk(documents=raw_docs) @@ -172,12 +259,14 @@ def ingest_worker( for i in range(min(5, len(raw_docs))): logging.info(f"Sample document {i}: {raw_docs[i]}") - file_data.update({ - "tokens": tokens, - "retriever": retriever, - "id": str(id), - "type": "local", - }) + file_data.update( + { + "tokens": tokens, + "retriever": retriever, + "id": str(id), + "type": "local", + } + ) upload_index(full_path, file_data) # delete local @@ -192,6 +281,7 @@ def ingest_worker( "limited": False, } + def remote_worker( self, source_data, @@ -203,7 +293,7 @@ def remote_worker( sync_frequency="never", operation_mode="upload", doc_id=None, -): +): full_path = os.path.join(directory, user, name_job) if not os.path.exists(full_path): os.makedirs(full_path) @@ -218,7 +308,7 @@ def remote_worker( chunking_strategy="classic_chunk", max_tokens=MAX_TOKENS, min_tokens=MIN_TOKENS, - duplicate_headers=False + duplicate_headers=False, ) docs = chunker.chunk(documents=raw_docs) docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs] @@ -260,6 +350,7 @@ def remote_worker( logging.info("remote_worker task completed successfully") return {"urls": source_data, "name_job": name_job, "user": user, "limited": False} + def sync( self, source_data, @@ -289,6 +380,7 @@ def sync( return {"status": "error", "error": str(e)} return {"status": "success"} + def sync_worker(self, frequency): sync_counts = Counter() sources = sources_collection.find() @@ -313,84 +405,137 @@ def sync_worker(self, frequency): for key in ["total_sync_count", "sync_success", "sync_failure"] } + def attachment_worker(self, directory, file_info, user): """ Process and store a single attachment without vectorization. - + Args: self: Reference to the instance of the task. directory (str): Base directory for storing files. file_info (dict): Dictionary with folder and filename info. user (str): User identifier. - + Returns: dict: Information about processed attachment. """ import datetime - import os import mimetypes + import os + from application.utils import num_tokens_from_string - + mongo = MongoDB.get_client() db = mongo["docsgpt"] attachments_collection = db["attachments"] - + filename = file_info["filename"] attachment_id = file_info["attachment_id"] - - logging.info(f"Processing attachment: {attachment_id}/{filename}", extra={"user": user}) - + + logging.info( + f"Processing attachment: {attachment_id}/{filename}", extra={"user": user} + ) + self.update_state(state="PROGRESS", meta={"current": 10}) - + file_path = os.path.join(directory, filename) - + if not os.path.exists(file_path): logging.warning(f"File not found: {file_path}", extra={"user": user}) raise FileNotFoundError(f"File not found: {file_path}") - + try: - reader = SimpleDirectoryReader( - input_files=[file_path] - ) + reader = SimpleDirectoryReader(input_files=[file_path]) documents = reader.load_data() - + self.update_state(state="PROGRESS", meta={"current": 50}) - + if documents: content = documents[0].text token_count = num_tokens_from_string(content) - + file_path_relative = f"{settings.UPLOAD_FOLDER}/{user}/attachments/{attachment_id}/{filename}" - - mime_type = mimetypes.guess_type(file_path)[0] or 'application/octet-stream' - + + mime_type = mimetypes.guess_type(file_path)[0] or "application/octet-stream" + doc_id = ObjectId(attachment_id) - attachments_collection.insert_one({ - "_id": doc_id, - "user": user, - "path": file_path_relative, - "content": content, - "token_count": token_count, - "mime_type": mime_type, - "date": datetime.datetime.now(), - }) - - logging.info(f"Stored attachment with ID: {attachment_id}", - extra={"user": user}) - + attachments_collection.insert_one( + { + "_id": doc_id, + "user": user, + "path": file_path_relative, + "content": content, + "token_count": token_count, + "mime_type": mime_type, + "date": datetime.datetime.now(), + } + ) + + logging.info( + f"Stored attachment with ID: {attachment_id}", extra={"user": user} + ) + self.update_state(state="PROGRESS", meta={"current": 100}) - + return { "filename": filename, "path": file_path_relative, "token_count": token_count, "attachment_id": attachment_id, - "mime_type": mime_type + "mime_type": mime_type, } else: - logging.warning("No content was extracted from the file", - extra={"user": user}) + logging.warning( + "No content was extracted from the file", extra={"user": user} + ) raise ValueError("No content was extracted from the file") except Exception as e: - logging.error(f"Error processing file {filename}: {e}", extra={"user": user}, exc_info=True) + logging.error( + f"Error processing file {filename}: {e}", + extra={"user": user}, + exc_info=True, + ) raise + + +def agent_webhook_worker(self, agent_id, payload): + """ + Process the webhook payload for an agent. + + Args: + self: Reference to the instance of the task. + agent_id (str): Unique identifier for the agent. + payload (dict): The payload data from the webhook. + + Returns: + dict: Information about the processed webhook. + """ + mongo = MongoDB.get_client() + db = mongo["docsgpt"] + agents_collection = db["agents"] + + self.update_state(state="PROGRESS", meta={"current": 1}) + try: + agent_oid = ObjectId(agent_id) + agent_config = agents_collection.find_one({"_id": agent_oid}) + if not agent_config: + raise ValueError(f"Agent with ID {agent_id} not found.") + input_data = payload.get("query", "") + if input_data is None or not isinstance(input_data, str): + input_data = json.dumps(payload) + except Exception as e: + logging.error(f"Error processing agent webhook: {e}", exc_info=True) + return {"status": "error", "error": str(e)} + + self.update_state(state="PROGRESS", meta={"current": 50}) + try: + result = run_agent_logic(agent_config, input_data) + except Exception as e: + logging.error(f"Error running agent logic: {e}", exc_info=True) + return {"status": "error", "error": str(e)} + finally: + self.update_state(state="PROGRESS", meta={"current": 100}) + logging.info( + f"Webhook processed for agent {agent_id}", extra={"agent_id": agent_id} + ) + return {"status": "success", "result": result} diff --git a/frontend/src/Navigation.tsx b/frontend/src/Navigation.tsx index 0e357a6d..53487dd6 100644 --- a/frontend/src/Navigation.tsx +++ b/frontend/src/Navigation.tsx @@ -44,6 +44,7 @@ import { setModalStateDeleteConv, setSelectedAgent, setAgents, + selectAgents, } from './preferences/preferenceSlice'; import Upload from './upload/Upload'; @@ -63,6 +64,7 @@ export default function Navigation({ navOpen, setNavOpen }: NavigationProps) { const conversations = useSelector(selectConversations); const conversationId = useSelector(selectConversationId); const modalStateDeleteConv = useSelector(selectModalStateDeleteConv); + const agents = useSelector(selectAgents); const selectedAgent = useSelector(selectSelectedAgent); const { isMobile } = useMediaQuery(); @@ -76,6 +78,31 @@ export default function Navigation({ navOpen, setNavOpen }: NavigationProps) { const navRef = useRef(null); + async function fetchRecentAgents() { + try { + let recentAgents: Agent[] = []; + if (!agents) { + const response = await userService.getAgents(token); + if (!response.ok) throw new Error('Failed to fetch agents'); + const data: Agent[] = await response.json(); + dispatch(setAgents(data)); + recentAgents = data; + } else recentAgents = agents; + setRecentAgents( + recentAgents + .filter((agent: Agent) => agent.status === 'published') + .sort( + (a: Agent, b: Agent) => + new Date(b.last_used_at ?? 0).getTime() - + new Date(a.last_used_at ?? 0).getTime(), + ) + .slice(0, 3), + ); + } catch (error) { + console.error('Failed to fetch recent agents: ', error); + } + } + async function fetchConversations() { dispatch(setConversations({ ...conversations, loading: true })); return await getConversations(token) @@ -88,25 +115,11 @@ export default function Navigation({ navOpen, setNavOpen }: NavigationProps) { }); } - async function getAgents() { - const response = await userService.getAgents(token); - if (!response.ok) throw new Error('Failed to fetch agents'); - const data: Agent[] = await response.json(); - dispatch(setAgents(data)); - setRecentAgents( - data - .filter((agent: Agent) => agent.status === 'published') - .sort( - (a: Agent, b: Agent) => - new Date(b.last_used_at ?? 0).getTime() - - new Date(a.last_used_at ?? 0).getTime(), - ) - .slice(0, 3), - ); - } + useEffect(() => { + if (token) fetchRecentAgents(); + }, [agents, token, dispatch]); useEffect(() => { - if (recentAgents.length === 0) getAgents(); if (!conversations?.data) fetchConversations(); if (queries.length === 0) resetConversation(); }, [conversations?.data, dispatch]); diff --git a/frontend/src/agents/AgentPreview.tsx b/frontend/src/agents/AgentPreview.tsx index 5eaf10a9..621ac477 100644 --- a/frontend/src/agents/AgentPreview.tsx +++ b/frontend/src/agents/AgentPreview.tsx @@ -141,6 +141,7 @@ export default function AgentPreview() { loading={status === 'loading'} showSourceButton={selectedAgent ? false : true} showToolButton={selectedAgent ? false : true} + autoFocus={false} />

This is a preview of the agent. You can publish it to start using it diff --git a/frontend/src/agents/NewAgent.tsx b/frontend/src/agents/NewAgent.tsx index 37466a86..3aa1bf7d 100644 --- a/frontend/src/agents/NewAgent.tsx +++ b/frontend/src/agents/NewAgent.tsx @@ -155,9 +155,10 @@ export default function NewAgent({ mode }: { mode: 'new' | 'edit' | 'draft' }) { const data = await response.json(); if (data.id) setAgent((prev) => ({ ...prev, id: data.id })); if (data.key) setAgent((prev) => ({ ...prev, key: data.key })); - if (effectiveMode === 'new') { - setAgentDetails('ACTIVE'); + if (effectiveMode === 'new' || effectiveMode === 'draft') { setEffectiveMode('edit'); + setAgent((prev) => ({ ...prev, status: 'published' })); + setAgentDetails('ACTIVE'); } }; @@ -408,7 +409,7 @@ export default function NewAgent({ mode }: { mode: 'new' | 'edit' | 'draft' }) { agent.prompt_id ? prompts.filter( (prompt) => prompt.id === agent.prompt_id, - )[0].name || null + )[0]?.name || null : null } onSelect={(option: { label: string; value: string }) => @@ -532,7 +533,7 @@ function AgentPreviewArea() { const selectedAgent = useSelector(selectSelectedAgent); return (

- {selectedAgent?.id ? ( + {selectedAgent?.status === 'published' ? (
@@ -540,7 +541,7 @@ function AgentPreviewArea() {
{' '}

- Published agents can be previewd here + Published agents can be previewed here

)} diff --git a/frontend/src/agents/index.tsx b/frontend/src/agents/index.tsx index 49123cd6..0ceef669 100644 --- a/frontend/src/agents/index.tsx +++ b/frontend/src/agents/index.tsx @@ -12,7 +12,13 @@ import ThreeDots from '../assets/three-dots.svg'; import ContextMenu, { MenuOption } from '../components/ContextMenu'; import ConfirmationModal from '../modals/ConfirmationModal'; import { ActiveState } from '../models/misc'; -import { selectToken, setSelectedAgent } from '../preferences/preferenceSlice'; +import { + selectToken, + setSelectedAgent, + setAgents, + selectAgents, + selectSelectedAgent, +} from '../preferences/preferenceSlice'; import AgentLogs from './AgentLogs'; import NewAgent from './NewAgent'; import { Agent } from './types'; @@ -31,9 +37,12 @@ export default function Agents() { function AgentsList() { const navigate = useNavigate(); + const dispatch = useDispatch(); const token = useSelector(selectToken); + const agents = useSelector(selectAgents); + const selectedAgent = useSelector(selectSelectedAgent); - const [userAgents, setUserAgents] = useState([]); + const [userAgents, setUserAgents] = useState(agents || []); const [loading, setLoading] = useState(true); const getAgents = async () => { @@ -43,6 +52,7 @@ function AgentsList() { if (!response.ok) throw new Error('Failed to fetch agents'); const data = await response.json(); setUserAgents(data); + dispatch(setAgents(data)); setLoading(false); } catch (error) { console.error('Error:', error); @@ -52,6 +62,7 @@ function AgentsList() { useEffect(() => { getAgents(); + if (selectedAgent) dispatch(setSelectedAgent(null)); }, [token]); return (
@@ -62,6 +73,7 @@ function AgentsList() { Discover and create custom versions of DocsGPT that combine instructions, extra knowledge, and any combination of skills.

+ {/* Premade agents section */} {/*

Premade by DocsGPT @@ -200,8 +212,10 @@ function AgentCard({ ]; const handleClick = () => { - dispatch(setSelectedAgent(agent)); - navigate(`/`); + if (agent.status === 'published') { + dispatch(setSelectedAgent(agent)); + navigate(`/`); + } }; const handleDelete = async (agentId: string) => { @@ -214,8 +228,11 @@ function AgentCard({ }; return (
handleClick()} + className={`relative flex h-44 w-48 flex-col justify-between rounded-[1.2rem] bg-[#F6F6F6] px-6 py-5 hover:bg-[#ECECEC] dark:bg-[#383838] hover:dark:bg-[#383838]/80 ${agent.status === 'published' && 'cursor-pointer'}`} + onClick={(e) => { + e.stopPropagation(); + handleClick(); + }} >
`/api/update_agent/${agent_id}`, DELETE_AGENT: (id: string) => `/api/delete_agent?id=${id}`, + AGENT_WEBHOOK: (id: string) => `/api/agent_webhook?id=${id}`, PROMPTS: '/api/get_prompts', CREATE_PROMPT: '/api/create_prompt', DELETE_PROMPT: '/api/delete_prompt', diff --git a/frontend/src/api/services/userService.ts b/frontend/src/api/services/userService.ts index bbe20b10..4a0f45d8 100644 --- a/frontend/src/api/services/userService.ts +++ b/frontend/src/api/services/userService.ts @@ -31,6 +31,8 @@ const userService = { apiClient.put(endpoints.USER.UPDATE_AGENT(agent_id), data, token), deleteAgent: (id: string, token: string | null): Promise => apiClient.delete(endpoints.USER.DELETE_AGENT(id), token), + getAgentWebhook: (id: string, token: string | null): Promise => + apiClient.get(endpoints.USER.AGENT_WEBHOOK(id), token), getPrompts: (token: string | null): Promise => apiClient.get(endpoints.USER.PROMPTS, token), createPrompt: (data: any, token: string | null): Promise => diff --git a/frontend/src/components/MessageInput.tsx b/frontend/src/components/MessageInput.tsx index e7ef7f9d..60cd4b81 100644 --- a/frontend/src/components/MessageInput.tsx +++ b/frontend/src/components/MessageInput.tsx @@ -36,15 +36,7 @@ type MessageInputProps = { loading: boolean; showSourceButton?: boolean; showToolButton?: boolean; -}; - -type UploadState = { - taskId: string; - fileName: string; - progress: number; - attachment_id?: string; - token_count?: number; - status: 'uploading' | 'processing' | 'completed' | 'failed'; + autoFocus?: boolean; }; export default function MessageInput({ @@ -54,6 +46,7 @@ export default function MessageInput({ loading, showSourceButton = true, showToolButton = true, + autoFocus = true, }: MessageInputProps) { const { t } = useTranslation(); const [isDarkTheme] = useDarkTheme(); @@ -235,7 +228,7 @@ export default function MessageInput({ }; useEffect(() => { - inputRef.current?.focus(); + if (autoFocus) inputRef.current?.focus(); handleInput(); }, []); diff --git a/frontend/src/modals/AgentDetailsModal.tsx b/frontend/src/modals/AgentDetailsModal.tsx index 377dd7bd..c1a8c131 100644 --- a/frontend/src/modals/AgentDetailsModal.tsx +++ b/frontend/src/modals/AgentDetailsModal.tsx @@ -1,7 +1,12 @@ +import { useState } from 'react'; +import { useSelector } from 'react-redux'; + import { Agent } from '../agents/types'; import { ActiveState } from '../models/misc'; import WrapperModal from './WrapperModal'; -import { useNavigate } from 'react-router-dom'; +import userService from '../api/services/userService'; +import { selectToken } from '../preferences/preferenceSlice'; +import Spinner from '../components/Spinner'; type AgentDetailsModalProps = { agent: Agent; @@ -16,13 +21,41 @@ export default function AgentDetailsModal({ modalState, setModalState, }: AgentDetailsModalProps) { - const navigate = useNavigate(); + const token = useSelector(selectToken); + + const [publicLink, setPublicLink] = useState(null); + const [apiKey, setApiKey] = useState(null); + const [webhookUrl, setWebhookUrl] = useState(null); + const [loadingStates, setLoadingStates] = useState({ + publicLink: false, + apiKey: false, + webhook: false, + }); + + const setLoading = ( + key: 'publicLink' | 'apiKey' | 'webhook', + state: boolean, + ) => { + setLoadingStates((prev) => ({ ...prev, [key]: state })); + }; + + const handleGenerateWebhook = async () => { + setLoading('webhook', true); + const response = await userService.getAgentWebhook(agent.id ?? '', token); + if (!response.ok) { + setLoading('webhook', false); + return; + } + const data = await response.json(); + setWebhookUrl(data.webhook_url); + setLoading('webhook', false); + }; + if (modalState !== 'ACTIVE') return null; return ( { - // if (mode === 'new') navigate('/agents'); setModalState('INACTIVE'); }} > @@ -57,9 +90,23 @@ export default function AgentDetailsModal({

Webhooks

- + {webhookUrl ? ( +
+ + {webhookUrl} + + +
+ ) : ( + + )}

diff --git a/frontend/src/modals/ConfirmationModal.tsx b/frontend/src/modals/ConfirmationModal.tsx index 25f8c2da..28151736 100644 --- a/frontend/src/modals/ConfirmationModal.tsx +++ b/frontend/src/modals/ConfirmationModal.tsx @@ -40,19 +40,23 @@ export default function ConfirmationModal({ >
-

+

{message}

)} diff --git a/frontend/src/agents/index.tsx b/frontend/src/agents/index.tsx index 0ceef669..c2edb34a 100644 --- a/frontend/src/agents/index.tsx +++ b/frontend/src/agents/index.tsx @@ -138,6 +138,7 @@ function AgentsList() { )) @@ -160,9 +161,11 @@ function AgentsList() { function AgentCard({ agent, + agents, setUserAgents, }: { agent: Agent; + agents: Agent[]; setUserAgents: React.Dispatch>; }) { const navigate = useNavigate(); @@ -225,6 +228,7 @@ function AgentCard({ setUserAgents((prevAgents) => prevAgents.filter((prevAgent) => prevAgent.id !== data.id), ); + dispatch(setAgents(agents.filter((prevAgent) => prevAgent.id !== data.id))); }; return (
+ + diff --git a/frontend/src/assets/monitoring-white.svg b/frontend/src/assets/monitoring-white.svg new file mode 100644 index 00000000..b015eeee --- /dev/null +++ b/frontend/src/assets/monitoring-white.svg @@ -0,0 +1,3 @@ + + + diff --git a/frontend/src/components/CopyButton.tsx b/frontend/src/components/CopyButton.tsx index c430603f..0afbbe82 100644 --- a/frontend/src/components/CopyButton.tsx +++ b/frontend/src/components/CopyButton.tsx @@ -1,58 +1,136 @@ +import clsx from 'clsx'; import copy from 'copy-to-clipboard'; -import { useState } from 'react'; +import { useCallback, useEffect, useRef, useState } from 'react'; import { useTranslation } from 'react-i18next'; import CheckMark from '../assets/checkmark.svg?react'; -import Copy from '../assets/copy.svg?react'; +import CopyIcon from '../assets/copy.svg?react'; + +type CopyButtonProps = { + textToCopy: string; + bgColorLight?: string; + bgColorDark?: string; + hoverBgColorLight?: string; + hoverBgColorDark?: string; + iconSize?: string; + padding?: string; + showText?: boolean; + copiedDuration?: number; + className?: string; + iconWrapperClassName?: string; + textClassName?: string; +}; + +const DEFAULT_ICON_SIZE = 'w-4 h-4'; +const DEFAULT_PADDING = 'p-2'; +const DEFAULT_COPIED_DURATION = 2000; +const DEFAULT_BG_LIGHT = '#FFFFFF'; +const DEFAULT_BG_DARK = 'transparent'; +const DEFAULT_HOVER_BG_LIGHT = '#EEEEEE'; +const DEFAULT_HOVER_BG_DARK = '#4A4A4A'; export default function CopyButton({ - text, - colorLight, - colorDark, + textToCopy, + bgColorLight = DEFAULT_BG_LIGHT, + bgColorDark = DEFAULT_BG_DARK, + hoverBgColorLight = DEFAULT_HOVER_BG_LIGHT, + hoverBgColorDark = DEFAULT_HOVER_BG_DARK, + iconSize = DEFAULT_ICON_SIZE, + padding = DEFAULT_PADDING, showText = false, -}: { - text: string; - colorLight?: string; - colorDark?: string; - showText?: boolean; -}) { + copiedDuration = DEFAULT_COPIED_DURATION, + className, + iconWrapperClassName, + textClassName, +}: CopyButtonProps) { const { t } = useTranslation(); - const [copied, setCopied] = useState(false); - const [isCopyHovered, setIsCopyHovered] = useState(false); + const [isCopied, setIsCopied] = useState(false); + const timeoutIdRef = useRef(null); - const handleCopyClick = (text: string) => { - copy(text); - setCopied(true); - setTimeout(() => { - setCopied(false); - }, 3000); - }; + const iconWrapperClasses = clsx( + 'flex items-center justify-center rounded-full transition-colors duration-150 ease-in-out', + padding, + `bg-[${bgColorLight}] dark:bg-[${bgColorDark}]`, + `hover:bg-[${hoverBgColorLight}] dark:hover:bg-[${hoverBgColorDark}]`, + { + 'bg-green-100 dark:bg-green-900 hover:bg-green-100 dark:hover:bg-green-900': + isCopied, + }, + iconWrapperClassName, + ); + const rootButtonClasses = clsx( + 'flex items-center gap-2 group', + 'focus:outline-none focus-visible:ring-2 focus-visible:ring-offset-2 focus-visible:ring-blue-500 rounded-full', + className, + ); + + const textSpanClasses = clsx( + 'text-xs text-gray-600 dark:text-gray-400 transition-opacity duration-150 ease-in-out', + { 'opacity-75': isCopied }, + textClassName, + ); + + const IconComponent = isCopied ? CheckMark : CopyIcon; + const iconClasses = clsx(iconSize, { + 'stroke-green-600 dark:stroke-green-400': isCopied, + 'fill-none text-gray-700 dark:text-gray-300': !isCopied, + }); + + const buttonTitle = isCopied + ? t('conversation.copied') + : t('conversation.copy'); + const displayedText = isCopied + ? t('conversation.copied') + : t('conversation.copy'); + + const handleCopy = useCallback(() => { + if (isCopied) return; + + try { + const success = copy(textToCopy); + if (success) { + setIsCopied(true); + + if (timeoutIdRef.current) { + clearTimeout(timeoutIdRef.current); + } + + timeoutIdRef.current = setTimeout(() => { + setIsCopied(false); + timeoutIdRef.current = null; + }, copiedDuration); + } else { + console.warn('Copy command failed.'); + } + } catch (error) { + console.error('Failed to copy text:', error); + } + }, [textToCopy, copiedDuration, isCopied]); + + useEffect(() => { + return () => { + if (timeoutIdRef.current) { + clearTimeout(timeoutIdRef.current); + } + }; + }, []); return ( ); } diff --git a/frontend/src/conversation/ConversationBubble.tsx b/frontend/src/conversation/ConversationBubble.tsx index a241b2d3..a7c8467d 100644 --- a/frontend/src/conversation/ConversationBubble.tsx +++ b/frontend/src/conversation/ConversationBubble.tsx @@ -5,10 +5,7 @@ import { useTranslation } from 'react-i18next'; import ReactMarkdown from 'react-markdown'; import { useSelector } from 'react-redux'; import { Prism as SyntaxHighlighter } from 'react-syntax-highlighter'; -import { - oneLight, - vscDarkPlus, -} from 'react-syntax-highlighter/dist/cjs/styles/prism'; +import { oneLight, vscDarkPlus } from 'react-syntax-highlighter/dist/cjs/styles/prism'; import rehypeKatex from 'rehype-katex'; import remarkGfm from 'remark-gfm'; import remarkMath from 'remark-math'; @@ -29,10 +26,7 @@ import CopyButton from '../components/CopyButton'; import Sidebar from '../components/Sidebar'; import SpeakButton from '../components/TextToSpeechButton'; import { useDarkTheme, useOutsideAlerter } from '../hooks'; -import { - selectChunks, - selectSelectedDocs, -} from '../preferences/preferenceSlice'; +import { selectChunks, selectSelectedDocs } from '../preferences/preferenceSlice'; import classes from './ConversationBubble.module.css'; import { FEEDBACK, MESSAGE_TYPE } from './conversationModels'; import { ToolCallsType } from './types'; @@ -377,7 +371,7 @@ const ConversationBubble = forwardRef< {language}
- +
{' '}

@@ -689,7 +683,7 @@ function ToolCalls({ toolCalls }: { toolCalls: ToolCallsType[] }) { Response {' '}

@@ -766,7 +760,7 @@ function Thought({ {language}

-

- Webhooks -

+
+

+ Webhook URL +

+ {webhookUrl && ( +
+ +
+ )} +
{webhookUrl ? ( -
- +
+

{webhookUrl} - - +

) : ( )}
diff --git a/frontend/src/settings/Analytics.tsx b/frontend/src/settings/Analytics.tsx index 04bec5c2..535200ef 100644 --- a/frontend/src/settings/Analytics.tsx +++ b/frontend/src/settings/Analytics.tsx @@ -1,11 +1,5 @@ import { - BarElement, - CategoryScale, - Chart as ChartJS, - Legend, - LinearScale, - Title, - Tooltip, + BarElement, CategoryScale, Chart as ChartJS, Legend, LinearScale, Title, Tooltip } from 'chart.js'; import { useEffect, useState } from 'react'; import { Bar } from 'react-chartjs-2'; @@ -71,7 +65,6 @@ export default function Analytics({ agentId }: AnalyticsProps) { string, { positive: number; negative: number } > | null>(null); - const [agent, setAgent] = useState(); const [messagesFilter, setMessagesFilter] = useState<{ label: string; value: string; @@ -97,21 +90,6 @@ export default function Analytics({ agentId }: AnalyticsProps) { const [loadingMessages, setLoadingMessages] = useLoaderState(true); const [loadingTokens, setLoadingTokens] = useLoaderState(true); const [loadingFeedback, setLoadingFeedback] = useLoaderState(true); - const [loadingAgent, setLoadingAgent] = useLoaderState(true); - - const fetchAgent = async (agentId: string) => { - setLoadingAgent(true); - try { - const response = await userService.getAgent(agentId ?? '', token); - if (!response.ok) throw new Error('Failed to fetch Chatbots'); - const agent = await response.json(); - setAgent(agent); - } catch (error) { - console.error(error); - } finally { - setLoadingAgent(false); - } - }; const fetchMessagesData = async (agent_id?: string, filter?: string) => { setLoadingMessages(true); @@ -174,27 +152,22 @@ export default function Analytics({ agentId }: AnalyticsProps) { }; useEffect(() => { - if (agentId) fetchAgent(agentId); - }, []); - - useEffect(() => { - const id = agent?.id; + const id = agentId; const filter = messagesFilter; fetchMessagesData(id, filter?.value); - }, [agent, messagesFilter]); + }, [agentId, messagesFilter]); useEffect(() => { - const id = agent?.id; + const id = agentId; const filter = tokenUsageFilter; fetchTokenData(id, filter?.value); - }, [agent, tokenUsageFilter]); + }, [agentId, tokenUsageFilter]); useEffect(() => { - const id = agent?.id; + const id = agentId; const filter = feedbackFilter; fetchFeedbackData(id, filter?.value); - }, [agent, feedbackFilter]); - + }, [agentId, feedbackFilter]); return (
{/* Messages Analytics */} diff --git a/frontend/src/settings/Logs.tsx b/frontend/src/settings/Logs.tsx index a14f5966..50d67b54 100644 --- a/frontend/src/settings/Logs.tsx +++ b/frontend/src/settings/Logs.tsx @@ -181,8 +181,7 @@ function Log({

From cc67d4a1e2034df2fb196d2b2d3e07beed8c5224 Mon Sep 17 00:00:00 2001 From: Alex Date: Mon, 28 Apr 2025 17:49:29 +0100 Subject: [PATCH 27/39] process all request data implicitly --- application/worker.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/application/worker.py b/application/worker.py index e7ac85a9..bebd88a6 100755 --- a/application/worker.py +++ b/application/worker.py @@ -547,9 +547,7 @@ def agent_webhook_worker(self, agent_id, payload): agent_config = agents_collection.find_one({"_id": agent_oid}) if not agent_config: raise ValueError(f"Agent with ID {agent_id} not found.") - input_data = payload.get("query", "") - if input_data is None or not isinstance(input_data, str): - input_data = json.dumps(payload) + input_data = json.dumps(payload) except Exception as e: logging.error(f"Error processing agent webhook: {e}", exc_info=True) return {"status": "error", "error": str(e)} From 22c7015c695f2053492b2bc43c9c595bccd01411 Mon Sep 17 00:00:00 2001 From: Siddhant Rai Date: Tue, 29 Apr 2025 00:29:16 +0530 Subject: [PATCH 28/39] refactor: webhook listener handle both POST and GET requests --- application/api/user/routes.py | 192 +++++++++++++++++++++++---------- 1 file changed, 134 insertions(+), 58 deletions(-) diff --git a/application/api/user/routes.py b/application/api/user/routes.py index 528a4c29..f8e40b24 100644 --- a/application/api/user/routes.py +++ b/application/api/user/routes.py @@ -2,9 +2,10 @@ import datetime import json import math import os +import secrets import shutil import uuid -import secrets +from functools import wraps from bson.binary import Binary, UuidRepresentation from bson.dbref import DBRef @@ -18,8 +19,8 @@ from application.agents.tools.tool_manager import ToolManager from application.api.user.tasks import ( ingest, ingest_remote, - store_attachment, process_agent_webhook, + store_attachment, ) from application.core.mongo_db import MongoDB from application.core.settings import settings @@ -419,13 +420,14 @@ class UploadFile(Resource): user = secure_filename(decoded_token.get("sub")) job_name = secure_filename(request.form["name"]) - + try: from application.storage.storage_creator import StorageCreator + storage = StorageCreator.get_storage() - + base_path = f"{settings.UPLOAD_FOLDER}/{user}/{job_name}" - + if len(files) > 1: temp_files = [] for file in files: @@ -434,41 +436,56 @@ class UploadFile(Resource): storage.save_file(file, temp_path) temp_files.append(temp_path) print(f"Saved file: {filename}") - + zip_filename = f"{job_name}.zip" zip_path = f"{base_path}/{zip_filename}" - + def create_zip_archive(temp_paths, **kwargs): import tempfile + with tempfile.TemporaryDirectory() as temp_dir: for path in temp_paths: file_data = storage.get_file(path) - with open(os.path.join(temp_dir, os.path.basename(path)), 'wb') as f: + with open( + os.path.join(temp_dir, os.path.basename(path)), "wb" + ) as f: f.write(file_data.read()) - + # Create zip archive zip_temp = shutil.make_archive( base_name=os.path.join(temp_dir, job_name), format="zip", - root_dir=temp_dir + root_dir=temp_dir, ) - + return zip_temp - + zip_temp_path = create_zip_archive(temp_files) - with open(zip_temp_path, 'rb') as zip_file: + with open(zip_temp_path, "rb") as zip_file: storage.save_file(zip_file, zip_path) - + # Clean up temp files for temp_path in temp_files: storage.delete_file(temp_path) - + task = ingest.delay( settings.UPLOAD_FOLDER, [ - ".rst", ".md", ".pdf", ".txt", ".docx", ".csv", ".epub", - ".html", ".mdx", ".json", ".xlsx", ".pptx", ".png", - ".jpg", ".jpeg", + ".rst", + ".md", + ".pdf", + ".txt", + ".docx", + ".csv", + ".epub", + ".html", + ".mdx", + ".json", + ".xlsx", + ".pptx", + ".png", + ".jpg", + ".jpeg", ], job_name, zip_filename, @@ -479,15 +496,27 @@ class UploadFile(Resource): file = files[0] filename = secure_filename(file.filename) file_path = f"{base_path}/{filename}" - + storage.save_file(file, file_path) - + task = ingest.delay( settings.UPLOAD_FOLDER, [ - ".rst", ".md", ".pdf", ".txt", ".docx", ".csv", ".epub", - ".html", ".mdx", ".json", ".xlsx", ".pptx", ".png", - ".jpg", ".jpeg", + ".rst", + ".md", + ".pdf", + ".txt", + ".docx", + ".csv", + ".epub", + ".html", + ".mdx", + ".json", + ".xlsx", + ".pptx", + ".png", + ".jpg", + ".jpeg", ], job_name, filename, @@ -497,7 +526,7 @@ class UploadFile(Resource): except Exception as err: current_app.logger.error(f"Error uploading file: {err}") return make_response(jsonify({"success": False}), 400) - + return make_response(jsonify({"success": True, "task_id": task.id}), 200) @@ -1386,39 +1415,88 @@ class AgentWebhook(Resource): ) -@user_ns.route(f"/api/webhooks/agents/") -class AgentWebhookListener(Resource): - @api.doc(description="Webhook listener for agent events") - def post(self, webhook_token): +def require_agent(func): + @wraps(func) + def wrapper(*args, **kwargs): + webhook_token = kwargs.get("webhook_token") + if not webhook_token: + return make_response( + jsonify({"success": False, "message": "Webhook token missing"}), 400 + ) + agent = agents_collection.find_one( {"incoming_webhook_token": webhook_token}, {"_id": 1} ) if not agent: + current_app.logger.warning( + f"Webhook attempt with invalid token: {webhook_token}" + ) return make_response( jsonify({"success": False, "message": "Agent not found"}), 404 ) - data = request.get_json() - if not data: - return make_response( - jsonify({"success": False, "message": "No data provided"}), 400 + + kwargs["agent"] = agent + kwargs["agent_id_str"] = str(agent["_id"]) + return func(*args, **kwargs) + + return wrapper + + +@user_ns.route(f"/api/webhooks/agents/") +class AgentWebhookListener(Resource): + method_decorators = [require_agent] + + def _enqueue_webhook_task(self, agent_id_str, payload, source_method): + if not payload: + current_app.logger.warning( + f"Webhook ({source_method}) received for agent {agent_id_str} with empty payload." ) - agent_id_str = str(agent["_id"]) current_app.logger.info( - f"Incoming webhook received for agent {agent_id_str}. Enqueuing task." + f"Incoming {source_method} webhook for agent {agent_id_str}. Enqueuing task with payload: {payload}" ) try: task = process_agent_webhook.delay( agent_id=agent_id_str, - payload=data, + payload=payload, ) + current_app.logger.info( + f"Task {task.id} enqueued for agent {agent_id_str} ({source_method})." + ) + return make_response(jsonify({"success": True, "task_id": task.id}), 200) except Exception as err: - current_app.logger.error(f"Error processing webhook: {err}") - return make_response( - jsonify({"success": False, "message": "Error processing webhook"}), 400 + current_app.logger.error( + f"Error enqueuing webhook task ({source_method}) for agent {agent_id_str}: {err}", + exc_info=True, ) - return make_response(jsonify({"success": True, "task_id": task.id}), 200) + return make_response( + jsonify({"success": False, "message": "Error processing webhook"}), 500 + ) + + @api.doc( + description="Webhook listener for agent events (POST). Expects JSON payload, which is used to trigger processing.", + ) + def post(self, webhook_token, agent, agent_id_str): + payload = request.get_json() + if payload is None: + return make_response( + jsonify( + { + "success": False, + "message": "Invalid or missing JSON data in request body", + } + ), + 400, + ) + return self._enqueue_webhook_task(agent_id_str, payload, source_method="POST") + + @api.doc( + description="Webhook listener for agent events (GET). Uses URL query parameters as payload to trigger processing.", + ) + def get(self, webhook_token, agent, agent_id_str): + payload = request.args.to_dict(flat=True) + return self._enqueue_webhook_task(agent_id_str, payload, source_method="GET") @user_ns.route("/api/share") @@ -2872,9 +2950,9 @@ class StoreAttachment(Resource): decoded_token = request.decoded_token if not decoded_token: return make_response(jsonify({"success": False}), 401) - + file = request.files.get("file") - + if not file or file.filename == "": return make_response( jsonify({"status": "error", "message": "Missing file"}), @@ -2882,35 +2960,33 @@ class StoreAttachment(Resource): ) user = secure_filename(decoded_token.get("sub")) - + try: attachment_id = ObjectId() original_filename = secure_filename(file.filename) relative_path = f"{settings.UPLOAD_FOLDER}/{user}/attachments/{str(attachment_id)}/{original_filename}" - + file_content = file.read() - + file_info = { "filename": original_filename, "attachment_id": str(attachment_id), "path": relative_path, - "file_content": file_content + "file_content": file_content, } - - task = store_attachment.delay( - file_info, - user - ) - + + task = store_attachment.delay(file_info, user) + return make_response( - jsonify({ - "success": True, - "task_id": task.id, - "message": "File uploaded successfully. Processing started." - }), - 200 + jsonify( + { + "success": True, + "task_id": task.id, + "message": "File uploaded successfully. Processing started.", + } + ), + 200, ) except Exception as err: current_app.logger.error(f"Error storing attachment: {err}") return make_response(jsonify({"success": False, "error": str(err)}), 400) - From 330276cdf7363f4b9b0c2e0edb7dfa0b52cf73a7 Mon Sep 17 00:00:00 2001 From: Alex Date: Mon, 28 Apr 2025 22:32:13 +0100 Subject: [PATCH 29/39] fix: lint for ruff --- application/api/user/routes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/application/api/user/routes.py b/application/api/user/routes.py index f8e40b24..d96d6202 100644 --- a/application/api/user/routes.py +++ b/application/api/user/routes.py @@ -1442,7 +1442,7 @@ def require_agent(func): return wrapper -@user_ns.route(f"/api/webhooks/agents/") +@user_ns.route("/api/webhooks/agents/") class AgentWebhookListener(Resource): method_decorators = [require_agent] From 4b2faae29aa1a619a371c66f09d062e9bedcf92e Mon Sep 17 00:00:00 2001 From: Alex Date: Thu, 1 May 2025 17:15:08 +0300 Subject: [PATCH 30/39] Update README.md --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 598cfa1d..bc4113ee 100644 --- a/README.md +++ b/README.md @@ -49,10 +49,10 @@ - [x] Manually updating chunks in the app UI (Feb 2025) - [x] Devcontainer for easy development (Feb 2025) - [x] ReACT agent (March 2025) -- [ ] Chatbots menu re-design to handle tools, agent types, and more (April 2025) -- [ ] New input box in the conversation menu (April 2025) -- [ ] Anthropic Tool compatibility (April 2025) -- [ ] Add triggerable actions / tools (webhook) (April 2025) +- [x] Chatbots menu re-design to handle tools, agent types, and more (April 2025) +- [x] New input box in the conversation menu (April 2025) +- [x] Add triggerable actions / tools (webhook) (April 2025) +- [ ] Anthropic Tool compatibility (May 2025) - [ ] Add OAuth 2.0 authentication for tools and sources - [ ] Agent scheduling From ae700e8f3ab9fc2655b8d0cd660b94d6ba0f7f18 Mon Sep 17 00:00:00 2001 From: Alex Date: Sun, 4 May 2025 18:56:33 +0100 Subject: [PATCH 31/39] fix: display only 2 demos buttons on mobile --- frontend/src/Hero.tsx | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/frontend/src/Hero.tsx b/frontend/src/Hero.tsx index 0161eac2..583f8a1b 100644 --- a/frontend/src/Hero.tsx +++ b/frontend/src/Hero.tsx @@ -38,9 +38,12 @@ export default function Hero({