Merge pull request #1873 from ManishMadan2882/main

Sources are the new Docs
2026-03-06 13:53:26 +00:00 · 2025-08-13 18:24:35 +01:00
parent cd0fbf79a3 d1a0fe6e91
commit 6a02bcf15b
40 changed files with 3412 additions and 1119 deletions
--- a/application/api/user/routes.py
+++ b/application/api/user/routes.py
@@ -3,11 +3,11 @@ import json
 import math
 import os
 import secrets
-import shutil
 import uuid
 from functools import wraps
 from typing import Optional, Tuple
-
+import tempfile
+import zipfile
 from bson.binary import Binary, UuidRepresentation
 from bson.dbref import DBRef
 from bson.objectid import ObjectId
@@ -44,6 +44,7 @@ from application.utils import (
    validate_function_name,
    validate_required_fields,
 )
+from application.utils import num_tokens_from_string
 from application.vectorstore.vector_creator import VectorCreator

 storage = StorageCreator.get_storage()
@@ -474,7 +475,7 @@ class DeleteByIds(Resource):
@user_ns.route("/api/delete_old")
 class DeleteOldIndexes(Resource):
    @api.doc(
-        description="Deletes old indexes",
+        description="Deletes old indexes and associated files",
        params={"source_id": "The source ID to delete"},
    )
    def get(self):
@@ -491,21 +492,40 @@ class DeleteOldIndexes(Resource):
        )
        if not doc:
            return make_response(jsonify({"status": "not found"}), 404)
+        
+        storage = StorageCreator.get_storage()
+        
        try:
+            # Delete vector index
            if settings.VECTOR_STORE == "faiss":
-                shutil.rmtree(os.path.join(current_dir, "indexes", str(doc["_id"])))
+                index_path = f"indexes/{str(doc['_id'])}"
+                if storage.file_exists(f"{index_path}/index.faiss"):
+                    storage.delete_file(f"{index_path}/index.faiss")
+                if storage.file_exists(f"{index_path}/index.pkl"):
+                    storage.delete_file(f"{index_path}/index.pkl")
            else:
                vectorstore = VectorCreator.create_vectorstore(
                    settings.VECTOR_STORE, source_id=str(doc["_id"])
                )
                vectorstore.delete_index()
+                
+            if "file_path" in doc and doc["file_path"]:
+                file_path = doc["file_path"]
+                if storage.is_directory(file_path):
+                    files = storage.list_files(file_path)
+                    for f in files:
+                        storage.delete_file(f)
+                else:
+                    storage.delete_file(file_path)
+                    
        except FileNotFoundError:
            pass
        except Exception as err:
            current_app.logger.error(
-                f"Error deleting old indexes: {err}", exc_info=True
+                f"Error deleting files and indexes: {err}", exc_info=True
            )
            return make_response(jsonify({"success": False}), 400)
+            
        sources_collection.delete_one({"_id": ObjectId(source_id)})
        return make_response(jsonify({"success": True}), 200)

@@ -549,146 +569,276 @@ class UploadFile(Resource):
        # Create safe versions for filesystem operations
        safe_user = safe_filename(user)
        dir_name = safe_filename(job_name)
+        base_path = f"{settings.UPLOAD_FOLDER}/{safe_user}/{dir_name}"

        try:
            storage = StorageCreator.get_storage()
-            base_path = f"{settings.UPLOAD_FOLDER}/{safe_user}/{dir_name}"
-
-            if len(files) > 1:
-                temp_files = []
-                for file in files:
-                    filename = safe_filename(file.filename)
-                    temp_path = f"{base_path}/temp/{filename}"
-                    storage.save_file(file, temp_path)
-                    temp_files.append(temp_path)
-                    print(f"Saved file: {filename}")
-                zip_filename = f"{dir_name}.zip"
-                zip_path = f"{base_path}/{zip_filename}"
-                zip_temp_path = None
-
-                def create_zip_archive(temp_paths, dir_name, storage):
-                    import tempfile
-
-                    with tempfile.NamedTemporaryFile(
-                        delete=False, suffix=".zip"
-                    ) as temp_zip_file:
-                        zip_output_path = temp_zip_file.name
-                    with tempfile.TemporaryDirectory() as stage_dir:
-                        for path in temp_paths:
-                            try:
-                                file_data = storage.get_file(path)
-                                with open(
-                                    os.path.join(stage_dir, os.path.basename(path)),
-                                    "wb",
-                                ) as f:
-                                    f.write(file_data.read())
-                            except Exception as e:
-                                current_app.logger.error(
-                                    f"Error processing file {path} for zipping: {e}",
-                                    exc_info=True,
-                                )
-                                if os.path.exists(zip_output_path):
-                                    os.remove(zip_output_path)
-                                raise
+            
+            
+            for file in files:
+                original_filename = file.filename
+                safe_file = safe_filename(original_filename)
+                
+                with tempfile.TemporaryDirectory() as temp_dir:
+                    temp_file_path = os.path.join(temp_dir, safe_file)
+                    file.save(temp_file_path)
+                    
+                    if zipfile.is_zipfile(temp_file_path):
                        try:
-                            shutil.make_archive(
-                                base_name=zip_output_path.replace(".zip", ""),
-                                format="zip",
-                                root_dir=stage_dir,
-                            )
+                            with zipfile.ZipFile(temp_file_path, 'r') as zip_ref:
+                                zip_ref.extractall(path=temp_dir)
+                                
+                                # Walk through extracted files and upload them
+                                for root, _, files in os.walk(temp_dir):
+                                    for extracted_file in files:
+                                        if os.path.join(root, extracted_file) == temp_file_path:
+                                            continue
+                                            
+                                        rel_path = os.path.relpath(os.path.join(root, extracted_file), temp_dir)
+                                        storage_path = f"{base_path}/{rel_path}"
+                                        
+                                        with open(os.path.join(root, extracted_file), 'rb') as f:
+                                            storage.save_file(f, storage_path)
                        except Exception as e:
-                            current_app.logger.error(
-                                f"Error creating zip archive: {e}", exc_info=True
-                            )
-                            if os.path.exists(zip_output_path):
-                                os.remove(zip_output_path)
-                            raise
-                    return zip_output_path
-
-                try:
-                    zip_temp_path = create_zip_archive(temp_files, dir_name, storage)
-                    with open(zip_temp_path, "rb") as zip_file:
-                        storage.save_file(zip_file, zip_path)
-                    task = ingest.delay(
-                        settings.UPLOAD_FOLDER,
-                        [
-                            ".rst",
-                            ".md",
-                            ".pdf",
-                            ".txt",
-                            ".docx",
-                            ".csv",
-                            ".epub",
-                            ".html",
-                            ".mdx",
-                            ".json",
-                            ".xlsx",
-                            ".pptx",
-                            ".png",
-                            ".jpg",
-                            ".jpeg",
-                        ],
-                        job_name,
-                        zip_filename,
-                        user,
-                        dir_name,
-                        safe_user,
-                    )
-                finally:
-                    # Clean up temporary files
-
-                    for temp_path in temp_files:
-                        try:
-                            storage.delete_file(temp_path)
-                        except Exception as e:
-                            current_app.logger.error(
-                                f"Error deleting temporary file {temp_path}: {e}",
-                                exc_info=True,
-                            )
-                    # Clean up the zip file if it was created
-
-                    if zip_temp_path and os.path.exists(zip_temp_path):
-                        os.remove(zip_temp_path)
-            else:  # Keep this else block for single file upload
-                # For single file
-
-                file = files[0]
-                filename = safe_filename(file.filename)
-                file_path = f"{base_path}/{filename}"
-
-                storage.save_file(file, file_path)
-
-                task = ingest.delay(
-                    settings.UPLOAD_FOLDER,
-                    [
-                        ".rst",
-                        ".md",
-                        ".pdf",
-                        ".txt",
-                        ".docx",
-                        ".csv",
-                        ".epub",
-                        ".html",
-                        ".mdx",
-                        ".json",
-                        ".xlsx",
-                        ".pptx",
-                        ".png",
-                        ".jpg",
-                        ".jpeg",
-                    ],
-                    job_name,
-                    filename,  # Corrected variable for single-file case
-                    user,
-                    dir_name,
-                    safe_user,
-                )
+                            current_app.logger.error(f"Error extracting zip: {e}", exc_info=True)
+                            # If zip extraction fails, save the original zip file
+                            file_path = f"{base_path}/{safe_file}"
+                            with open(temp_file_path, 'rb') as f:
+                                storage.save_file(f, file_path)
+                    else:
+                        # For non-zip files, save directly
+                        file_path = f"{base_path}/{safe_file}"
+                        with open(temp_file_path, 'rb') as f:
+                            storage.save_file(f, file_path)
+            
+            task = ingest.delay(
+                settings.UPLOAD_FOLDER,
+                [
+                    ".rst", ".md", ".pdf", ".txt", ".docx", ".csv", ".epub",
+                    ".html", ".mdx", ".json", ".xlsx", ".pptx", ".png",
+                    ".jpg", ".jpeg",
+                ],
+                job_name,
+                user,
+                file_path=base_path,
+                filename=dir_name
+            )
        except Exception as err:
            current_app.logger.error(f"Error uploading file: {err}", exc_info=True)
            return make_response(jsonify({"success": False}), 400)
        return make_response(jsonify({"success": True, "task_id": task.id}), 200)


+@user_ns.route("/api/manage_source_files")
+class ManageSourceFiles(Resource):
+    @api.expect(
+        api.model(
+            "ManageSourceFilesModel",
+            {
+                "source_id": fields.String(required=True, description="Source ID to modify"),
+                "operation": fields.String(required=True, description="Operation: 'add', 'remove', or 'remove_directory'"),
+                "file_paths": fields.List(fields.String, required=False, description="File paths to remove (for remove operation)"),
+                "directory_path": fields.String(required=False, description="Directory path to remove (for remove_directory operation)"),
+                "file": fields.Raw(required=False, description="Files to add (for add operation)"),
+                "parent_dir": fields.String(required=False, description="Parent directory path relative to source root"),
+            },
+        )
+    )
+    @api.doc(
+        description="Add files, remove files, or remove directories from an existing source",
+    )
+    def post(self):
+        decoded_token = request.decoded_token
+        if not decoded_token:
+            return make_response(jsonify({"success": False, "message": "Unauthorized"}), 401)
+
+        user = decoded_token.get("sub")
+        source_id = request.form.get("source_id")
+        operation = request.form.get("operation")
+
+        if not source_id or not operation:
+            return make_response(
+                jsonify({"success": False, "message": "source_id and operation are required"}), 400
+            )
+
+        if operation not in ["add", "remove", "remove_directory"]:
+            return make_response(
+                jsonify({"success": False, "message": "operation must be 'add', 'remove', or 'remove_directory'"}), 400
+            )
+
+        try:
+            ObjectId(source_id)
+        except Exception:
+            return make_response(
+                jsonify({"success": False, "message": "Invalid source ID format"}), 400
+            )
+
+        try:
+            source = sources_collection.find_one({"_id": ObjectId(source_id), "user": user})
+            if not source:
+                return make_response(
+                    jsonify({"success": False, "message": "Source not found or access denied"}), 404
+                )
+        except Exception as err:
+            current_app.logger.error(f"Error finding source: {err}", exc_info=True)
+            return make_response(jsonify({"success": False, "message": "Database error"}), 500)
+
+        try:
+            storage = StorageCreator.get_storage()
+            source_file_path = source.get("file_path", "")
+            parent_dir = request.form.get("parent_dir", "") 
+            
+            if parent_dir and (parent_dir.startswith("/") or ".." in parent_dir):
+                return make_response(
+                    jsonify({"success": False, "message": "Invalid parent directory path"}), 400
+                )
+
+            if operation == "add":
+                files = request.files.getlist("file")
+                if not files or all(file.filename == "" for file in files):
+                    return make_response(
+                        jsonify({"success": False, "message": "No files provided for add operation"}), 400
+                    )
+
+                added_files = []
+                
+                target_dir = source_file_path
+                if parent_dir:
+                    target_dir = f"{source_file_path}/{parent_dir}"
+
+                for file in files:
+                    if file.filename:
+                        safe_filename_str = safe_filename(file.filename)
+                        file_path = f"{target_dir}/{safe_filename_str}"
+
+                        # Save file to storage
+                        storage.save_file(file, file_path)
+                        added_files.append(safe_filename_str)
+
+                # Trigger re-ingestion pipeline
+                from application.api.user.tasks import reingest_source_task
+
+                task = reingest_source_task.delay(source_id=source_id, user=user)
+
+                return make_response(jsonify({
+                    "success": True,
+                    "message": f"Added {len(added_files)} files",
+                    "added_files": added_files,
+                    "parent_dir": parent_dir,
+                    "reingest_task_id": task.id
+                }), 200)
+
+            elif operation == "remove":
+                file_paths_str = request.form.get("file_paths")
+                if not file_paths_str:
+                    return make_response(
+                        jsonify({"success": False, "message": "file_paths required for remove operation"}), 400
+                    )
+
+                try:
+                    file_paths = json.loads(file_paths_str) if isinstance(file_paths_str, str) else file_paths_str
+                except Exception:
+                    return make_response(
+                        jsonify({"success": False, "message": "Invalid file_paths format"}), 400
+                    )
+
+                # Remove files from storage and directory structure
+                removed_files = []
+                for file_path in file_paths:
+                    full_path = f"{source_file_path}/{file_path}"
+
+                    # Remove from storage
+                    if storage.file_exists(full_path):
+                        storage.delete_file(full_path)
+                        removed_files.append(file_path)
+
+                # Trigger re-ingestion pipeline
+                from application.api.user.tasks import reingest_source_task
+
+                task = reingest_source_task.delay(source_id=source_id, user=user)
+
+                return make_response(jsonify({
+                    "success": True,
+                    "message": f"Removed {len(removed_files)} files",
+                    "removed_files": removed_files,
+                    "reingest_task_id": task.id
+                }), 200)
+
+            elif operation == "remove_directory":
+                directory_path = request.form.get("directory_path")
+                if not directory_path:
+                    return make_response(
+                        jsonify({"success": False, "message": "directory_path required for remove_directory operation"}), 400
+                    )
+
+                # Validate directory path (prevent path traversal)
+                if directory_path.startswith("/") or ".." in directory_path:
+                    current_app.logger.warning(
+                        f"Invalid directory path attempted for removal. "
+                        f"User: {user}, Source ID: {source_id}, Directory path: {directory_path}"
+                    )
+                    return make_response(
+                        jsonify({"success": False, "message": "Invalid directory path"}), 400
+                    )
+
+                full_directory_path = f"{source_file_path}/{directory_path}" if directory_path else source_file_path
+
+                if not storage.is_directory(full_directory_path):
+                    current_app.logger.warning(
+                        f"Directory not found or is not a directory for removal. "
+                        f"User: {user}, Source ID: {source_id}, Directory path: {directory_path}, "
+                        f"Full path: {full_directory_path}"
+                    )
+                    return make_response(
+                        jsonify({"success": False, "message": "Directory not found or is not a directory"}), 404
+                    )
+
+                success = storage.remove_directory(full_directory_path)
+
+                if not success:
+                    current_app.logger.error(
+                        f"Failed to remove directory from storage. "
+                        f"User: {user}, Source ID: {source_id}, Directory path: {directory_path}, "
+                        f"Full path: {full_directory_path}"
+                    )
+                    return make_response(
+                        jsonify({"success": False, "message": "Failed to remove directory"}), 500
+                    )
+
+                current_app.logger.info(
+                    f"Successfully removed directory. "
+                    f"User: {user}, Source ID: {source_id}, Directory path: {directory_path}, "
+                    f"Full path: {full_directory_path}"
+                )
+
+                # Trigger re-ingestion pipeline
+                from application.api.user.tasks import reingest_source_task
+
+                task = reingest_source_task.delay(source_id=source_id, user=user)
+
+                return make_response(jsonify({
+                    "success": True,
+                    "message": f"Successfully removed directory: {directory_path}",
+                    "removed_directory": directory_path,
+                    "reingest_task_id": task.id
+                }), 200)
+
+        except Exception as err:
+            error_context = f"operation={operation}, user={user}, source_id={source_id}"
+            if operation == "remove_directory":
+                directory_path = request.form.get("directory_path", "")
+                error_context += f", directory_path={directory_path}"
+            elif operation == "remove":
+                file_paths_str = request.form.get("file_paths", "")
+                error_context += f", file_paths={file_paths_str}"
+            elif operation == "add":
+                parent_dir = request.form.get("parent_dir", "")
+                error_context += f", parent_dir={parent_dir}"
+
+            current_app.logger.error(f"Error managing source files: {err} ({error_context})", exc_info=True)
+            return make_response(jsonify({"success": False, "message": "Operation failed"}), 500)
+
+
@user_ns.route("/api/remote")
 class UploadRemote(Resource):
    @api.expect(
@@ -834,6 +984,7 @@ class PaginatedSources(Resource):
                    "tokens": doc.get("tokens", ""),
                    "retriever": doc.get("retriever", "classic"),
                    "syncFrequency": doc.get("sync_frequency", ""),
+                    "isNested": bool(doc.get("directory_structure"))
                }
                paginated_docs.append(doc_data)
            response = {
@@ -881,6 +1032,7 @@ class CombinedJson(Resource):
                        "tokens": index.get("tokens", ""),
                        "retriever": index.get("retriever", "classic"),
                        "syncFrequency": index.get("sync_frequency", ""),
+                        "is_nested": bool(index.get("directory_structure"))
                    }
                )
        except Exception as err:
@@ -3374,8 +3526,14 @@ class DeleteTool(Resource):
@user_ns.route("/api/get_chunks")
 class GetChunks(Resource):
    @api.doc(
-        description="Retrieves all chunks associated with a document",
-        params={"id": "The document ID"},
+        description="Retrieves chunks from a document, optionally filtered by file path and search term",
+        params={
+            "id": "The document ID",
+            "page": "Page number for pagination",
+            "per_page": "Number of chunks per page",
+            "path": "Optional: Filter chunks by relative file path",
+            "search": "Optional: Search term to filter chunks by title or content"
+        },
    )
    def get(self):
        decoded_token = request.decoded_token
@@ -3385,6 +3543,8 @@ class GetChunks(Resource):
        doc_id = request.args.get("id")
        page = int(request.args.get("page", 1))
        per_page = int(request.args.get("per_page", 10))
+        path = request.args.get("path")
+        search_term = request.args.get("search", "").strip().lower()

        if not ObjectId.is_valid(doc_id):
            return make_response(jsonify({"error": "Invalid doc_id"}), 400)
@@ -3396,6 +3556,30 @@ class GetChunks(Resource):
        try:
            store = get_vector_store(doc_id)
            chunks = store.get_chunks()
+            
+            filtered_chunks = []
+            for chunk in chunks:
+                metadata = chunk.get("metadata", {})
+
+                # Filter by path if provided
+                if path:
+                    chunk_source = metadata.get("source", "")
+                    # Check if the chunk's source matches the requested path
+                    if not chunk_source or not chunk_source.endswith(path):
+                        continue
+
+                # Filter by search term if provided
+                if search_term:
+                    text_match = search_term in chunk.get("text", "").lower()
+                    title_match = search_term in metadata.get("title", "").lower()
+
+                    if not (text_match or title_match):
+                        continue
+
+                filtered_chunks.append(chunk)
+            
+            chunks = filtered_chunks
+            
            total_chunks = len(chunks)
            start = (page - 1) * per_page
            end = start + per_page
@@ -3408,6 +3592,8 @@ class GetChunks(Resource):
                        "per_page": per_page,
                        "total": total_chunks,
                        "chunks": paginated_chunks,
+                        "path": path if path else None,
+                        "search": search_term if search_term else None
                    }
                ),
                200,
@@ -3416,7 +3602,6 @@ class GetChunks(Resource):
            current_app.logger.error(f"Error getting chunks: {e}", exc_info=True)
            return make_response(jsonify({"success": False}), 500)

-
@user_ns.route("/api/add_chunk")
 class AddChunk(Resource):
    @api.expect(
@@ -3448,6 +3633,8 @@ class AddChunk(Resource):
        doc_id = data.get("id")
        text = data.get("text")
        metadata = data.get("metadata", {})
+        token_count = num_tokens_from_string(text)
+        metadata["token_count"] = token_count

        if not ObjectId.is_valid(doc_id):
            return make_response(jsonify({"error": "Invalid doc_id"}), 400)
@@ -3544,6 +3731,12 @@ class UpdateChunk(Resource):
        text = data.get("text")
        metadata = data.get("metadata")

+        if text is not None:
+            token_count = num_tokens_from_string(text)
+            if metadata is None:
+                metadata = {}
+            metadata["token_count"] = token_count
+
        if not ObjectId.is_valid(doc_id):
            return make_response(jsonify({"error": "Invalid doc_id"}), 400)
        doc = sources_collection.find_one({"_id": ObjectId(doc_id), "user": user})
@@ -3553,31 +3746,45 @@ class UpdateChunk(Resource):
            )
        try:
            store = get_vector_store(doc_id)
+
            chunks = store.get_chunks()
            existing_chunk = next((c for c in chunks if c["doc_id"] == chunk_id), None)
            if not existing_chunk:
                return make_response(jsonify({"error": "Chunk not found"}), 404)
-            deleted = store.delete_chunk(chunk_id)
-            if not deleted:
-                return make_response(
-                    jsonify({"error": "Failed to delete existing chunk"}), 500
-                )
+
            new_text = text if text is not None else existing_chunk["text"]
-            new_metadata = (
-                metadata if metadata is not None else existing_chunk["metadata"]
-            )

-            new_chunk_id = store.add_chunk(new_text, new_metadata)
+            if metadata is not None:
+                new_metadata = existing_chunk["metadata"].copy()
+                new_metadata.update(metadata)
+            else:
+                new_metadata = existing_chunk["metadata"].copy()

-            return make_response(
-                jsonify(
-                    {
-                        "message": "Chunk updated successfully",
-                        "new_chunk_id": new_chunk_id,
-                    }
-                ),
-                200,
-            )
+            if text is not None:
+                new_metadata["token_count"] = num_tokens_from_string(new_text)
+
+            try:
+                new_chunk_id = store.add_chunk(new_text, new_metadata)
+
+                deleted = store.delete_chunk(chunk_id)
+                if not deleted:
+                    current_app.logger.warning(f"Failed to delete old chunk {chunk_id}, but new chunk {new_chunk_id} was created")
+
+                return make_response(
+                    jsonify(
+                        {
+                            "message": "Chunk updated successfully",
+                            "chunk_id": new_chunk_id,
+                            "original_chunk_id": chunk_id,
+                        }
+                    ),
+                    200,
+                )
+            except Exception as add_error:
+                current_app.logger.error(f"Failed to add updated chunk: {add_error}")
+                return make_response(
+                    jsonify({"error": "Failed to update chunk - addition failed"}), 500
+                )
        except Exception as e:
            current_app.logger.error(f"Error updating chunk: {e}", exc_info=True)
            return make_response(jsonify({"success": False}), 500)
@@ -3681,3 +3888,51 @@ class ServeImage(Resource):
            return make_response(
                jsonify({"success": False, "message": "Error retrieving image"}), 500
            )
+
+
+@user_ns.route("/api/directory_structure")
+class DirectoryStructure(Resource):
+    @api.doc(
+        description="Get the directory structure for a document",
+        params={"id": "The document ID"},
+    )
+    def get(self):
+        decoded_token = request.decoded_token
+        if not decoded_token:
+            return make_response(jsonify({"success": False}), 401)
+        
+        user = decoded_token.get("sub")
+        doc_id = request.args.get("id")
+        
+        if not doc_id:
+            return make_response(
+                jsonify({"error": "Document ID is required"}), 400
+            )
+            
+        if not ObjectId.is_valid(doc_id):
+            return make_response(jsonify({"error": "Invalid document ID"}), 400)
+            
+        try:
+            doc = sources_collection.find_one({"_id": ObjectId(doc_id), "user": user})
+            if not doc:
+                return make_response(
+                    jsonify({"error": "Document not found or access denied"}), 404
+                )
+                
+            directory_structure = doc.get("directory_structure", {})
+            
+            return make_response(
+                jsonify({
+                    "success": True,
+                    "directory_structure": directory_structure,
+                    "base_path": doc.get("file_path", "")
+                }), 200
+            )
+            
+        except Exception as e:
+            current_app.logger.error(
+                f"Error retrieving directory structure: {e}", exc_info=True
+            )
+            return make_response(
+                jsonify({"success": False, "error": str(e)}), 500
+            )