refactor: break up monolithic routes.py into modular domain structure
7
application/api/user/sources/__init__.py
Normal file
@@ -0,0 +1,7 @@
"""Sources module."""

from .chunks import sources_chunks_ns
from .routes import sources_ns
from .upload import sources_upload_ns

__all__ = ["sources_ns", "sources_chunks_ns", "sources_upload_ns"]
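The package re-exports its three namespaces so a single registration site can pick them up. The exact wiring is not part of this commit, so the sketch below is an assumption; flask_restx's Api.add_namespace itself is standard.

# Hypothetical wiring, not part of this commit.
from application.api import api
from application.api.user.sources import (
    sources_chunks_ns,
    sources_ns,
    sources_upload_ns,
)

for ns in (sources_ns, sources_chunks_ns, sources_upload_ns):
    api.add_namespace(ns)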
278
application/api/user/sources/chunks.py
Normal file
@@ -0,0 +1,278 @@
"""Chunk management for source documents."""

from bson.objectid import ObjectId
from flask import current_app, jsonify, make_response, request
from flask_restx import fields, Namespace, Resource

from application.api import api
from application.api.user.base import get_vector_store, sources_collection
from application.utils import check_required_fields, num_tokens_from_string

sources_chunks_ns = Namespace(
    "sources", description="Source document management operations", path="/api"
)


@sources_chunks_ns.route("/get_chunks")
class GetChunks(Resource):
    @api.doc(
        description="Retrieves chunks from a document, optionally filtered by file path and search term",
        params={
            "id": "The document ID",
            "page": "Page number for pagination",
            "per_page": "Number of chunks per page",
            "path": "Optional: Filter chunks by relative file path",
            "search": "Optional: Search term to filter chunks by title or content",
        },
    )
    def get(self):
        decoded_token = request.decoded_token
        if not decoded_token:
            return make_response(jsonify({"success": False}), 401)
        user = decoded_token.get("sub")
        doc_id = request.args.get("id")
        page = int(request.args.get("page", 1))
        per_page = int(request.args.get("per_page", 10))
        path = request.args.get("path")
        search_term = request.args.get("search", "").strip().lower()

        if not ObjectId.is_valid(doc_id):
            return make_response(jsonify({"error": "Invalid doc_id"}), 400)
        doc = sources_collection.find_one({"_id": ObjectId(doc_id), "user": user})
        if not doc:
            return make_response(
                jsonify({"error": "Document not found or access denied"}), 404
            )
        try:
            store = get_vector_store(doc_id)
            chunks = store.get_chunks()

            filtered_chunks = []
            for chunk in chunks:
                metadata = chunk.get("metadata", {})

                # Filter by path if provided
                if path:
                    chunk_source = metadata.get("source", "")
                    # Check if the chunk's source matches the requested path
                    if not chunk_source or not chunk_source.endswith(path):
                        continue
                # Filter by search term if provided
                if search_term:
                    text_match = search_term in chunk.get("text", "").lower()
                    title_match = search_term in metadata.get("title", "").lower()

                    if not (text_match or title_match):
                        continue
                filtered_chunks.append(chunk)
            chunks = filtered_chunks

            total_chunks = len(chunks)
            start = (page - 1) * per_page
            end = start + per_page
            paginated_chunks = chunks[start:end]

            return make_response(
                jsonify(
                    {
                        "page": page,
                        "per_page": per_page,
                        "total": total_chunks,
                        "chunks": paginated_chunks,
                        "path": path if path else None,
                        "search": search_term if search_term else None,
                    }
                ),
                200,
            )
        except Exception as e:
            current_app.logger.error(f"Error getting chunks: {e}", exc_info=True)
            return make_response(jsonify({"success": False}), 500)

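A minimal client-side sketch of the pagination/filter contract above. The base URL and the bearer-token header are assumptions; request.decoded_token is populated by auth middleware outside this diff.

import requests

BASE_URL = "http://localhost:7091"           # assumed local API address
HEADERS = {"Authorization": "Bearer <JWT>"}  # assumed token transport

resp = requests.get(
    f"{BASE_URL}/api/get_chunks",
    params={"id": "<document ObjectId>", "page": 1, "per_page": 25,
            "path": "docs/intro.md", "search": "install"},
    headers=HEADERS,
)
resp.raise_for_status()
print(resp.json()["total"], "matching chunks")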
@sources_chunks_ns.route("/add_chunk")
|
||||
class AddChunk(Resource):
|
||||
@api.expect(
|
||||
api.model(
|
||||
"AddChunkModel",
|
||||
{
|
||||
"id": fields.String(required=True, description="Document ID"),
|
||||
"text": fields.String(required=True, description="Text of the chunk"),
|
||||
"metadata": fields.Raw(
|
||||
required=False,
|
||||
description="Metadata associated with the chunk",
|
||||
),
|
||||
},
|
||||
)
|
||||
)
|
||||
@api.doc(
|
||||
description="Adds a new chunk to the document",
|
||||
)
|
||||
def post(self):
|
||||
decoded_token = request.decoded_token
|
||||
if not decoded_token:
|
||||
return make_response(jsonify({"success": False}), 401)
|
||||
user = decoded_token.get("sub")
|
||||
data = request.get_json()
|
||||
required_fields = ["id", "text"]
|
||||
missing_fields = check_required_fields(data, required_fields)
|
||||
if missing_fields:
|
||||
return missing_fields
|
||||
doc_id = data.get("id")
|
||||
text = data.get("text")
|
||||
metadata = data.get("metadata", {})
|
||||
token_count = num_tokens_from_string(text)
|
||||
metadata["token_count"] = token_count
|
||||
|
||||
if not ObjectId.is_valid(doc_id):
|
||||
return make_response(jsonify({"error": "Invalid doc_id"}), 400)
|
||||
doc = sources_collection.find_one({"_id": ObjectId(doc_id), "user": user})
|
||||
if not doc:
|
||||
return make_response(
|
||||
jsonify({"error": "Document not found or access denied"}), 404
|
||||
)
|
||||
try:
|
||||
store = get_vector_store(doc_id)
|
||||
chunk_id = store.add_chunk(text, metadata)
|
||||
return make_response(
|
||||
jsonify({"message": "Chunk added successfully", "chunk_id": chunk_id}),
|
||||
201,
|
||||
)
|
||||
except Exception as e:
|
||||
current_app.logger.error(f"Error adding chunk: {e}", exc_info=True)
|
||||
return make_response(jsonify({"success": False}), 500)
|
||||
|
||||
|
||||
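The matching write call: the server computes token_count itself, so clients send only text and optional metadata (same assumed base URL and auth header as above).

import requests

resp = requests.post(
    "http://localhost:7091/api/add_chunk",        # assumed base URL
    json={
        "id": "<document ObjectId>",
        "text": "New paragraph to index.",
        "metadata": {"title": "intro"},
    },
    headers={"Authorization": "Bearer <JWT>"},    # assumed token transport
)
new_chunk_id = resp.json()["chunk_id"]            # 201 on success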
@sources_chunks_ns.route("/delete_chunk")
|
||||
class DeleteChunk(Resource):
|
||||
@api.doc(
|
||||
description="Deletes a specific chunk from the document.",
|
||||
params={"id": "The document ID", "chunk_id": "The ID of the chunk to delete"},
|
||||
)
|
||||
def delete(self):
|
||||
decoded_token = request.decoded_token
|
||||
if not decoded_token:
|
||||
return make_response(jsonify({"success": False}), 401)
|
||||
user = decoded_token.get("sub")
|
||||
doc_id = request.args.get("id")
|
||||
chunk_id = request.args.get("chunk_id")
|
||||
|
||||
if not ObjectId.is_valid(doc_id):
|
||||
return make_response(jsonify({"error": "Invalid doc_id"}), 400)
|
||||
doc = sources_collection.find_one({"_id": ObjectId(doc_id), "user": user})
|
||||
if not doc:
|
||||
return make_response(
|
||||
jsonify({"error": "Document not found or access denied"}), 404
|
||||
)
|
||||
try:
|
||||
store = get_vector_store(doc_id)
|
||||
deleted = store.delete_chunk(chunk_id)
|
||||
if deleted:
|
||||
return make_response(
|
||||
jsonify({"message": "Chunk deleted successfully"}), 200
|
||||
)
|
||||
else:
|
||||
return make_response(
|
||||
jsonify({"message": "Chunk not found or could not be deleted"}),
|
||||
404,
|
||||
)
|
||||
except Exception as e:
|
||||
current_app.logger.error(f"Error deleting chunk: {e}", exc_info=True)
|
||||
return make_response(jsonify({"success": False}), 500)
|
||||
|
||||
|
||||
@sources_chunks_ns.route("/update_chunk")
|
||||
class UpdateChunk(Resource):
|
||||
@api.expect(
|
||||
api.model(
|
||||
"UpdateChunkModel",
|
||||
{
|
||||
"id": fields.String(required=True, description="Document ID"),
|
||||
"chunk_id": fields.String(
|
||||
required=True, description="Chunk ID to update"
|
||||
),
|
||||
"text": fields.String(
|
||||
required=False, description="New text of the chunk"
|
||||
),
|
||||
"metadata": fields.Raw(
|
||||
required=False,
|
||||
description="Updated metadata associated with the chunk",
|
||||
),
|
||||
},
|
||||
)
|
||||
)
|
||||
@api.doc(
|
||||
description="Updates an existing chunk in the document.",
|
||||
)
|
||||
def put(self):
|
||||
decoded_token = request.decoded_token
|
||||
if not decoded_token:
|
||||
return make_response(jsonify({"success": False}), 401)
|
||||
user = decoded_token.get("sub")
|
||||
data = request.get_json()
|
||||
required_fields = ["id", "chunk_id"]
|
||||
missing_fields = check_required_fields(data, required_fields)
|
||||
if missing_fields:
|
||||
return missing_fields
|
||||
doc_id = data.get("id")
|
||||
chunk_id = data.get("chunk_id")
|
||||
text = data.get("text")
|
||||
metadata = data.get("metadata")
|
||||
|
||||
if text is not None:
|
||||
token_count = num_tokens_from_string(text)
|
||||
if metadata is None:
|
||||
metadata = {}
|
||||
metadata["token_count"] = token_count
|
||||
if not ObjectId.is_valid(doc_id):
|
||||
return make_response(jsonify({"error": "Invalid doc_id"}), 400)
|
||||
doc = sources_collection.find_one({"_id": ObjectId(doc_id), "user": user})
|
||||
if not doc:
|
||||
return make_response(
|
||||
jsonify({"error": "Document not found or access denied"}), 404
|
||||
)
|
||||
try:
|
||||
store = get_vector_store(doc_id)
|
||||
|
||||
chunks = store.get_chunks()
|
||||
existing_chunk = next((c for c in chunks if c["doc_id"] == chunk_id), None)
|
||||
if not existing_chunk:
|
||||
return make_response(jsonify({"error": "Chunk not found"}), 404)
|
||||
new_text = text if text is not None else existing_chunk["text"]
|
||||
|
||||
if metadata is not None:
|
||||
new_metadata = existing_chunk["metadata"].copy()
|
||||
new_metadata.update(metadata)
|
||||
else:
|
||||
new_metadata = existing_chunk["metadata"].copy()
|
||||
if text is not None:
|
||||
new_metadata["token_count"] = num_tokens_from_string(new_text)
|
||||
try:
|
||||
new_chunk_id = store.add_chunk(new_text, new_metadata)
|
||||
|
||||
deleted = store.delete_chunk(chunk_id)
|
||||
if not deleted:
|
||||
current_app.logger.warning(
|
||||
f"Failed to delete old chunk {chunk_id}, but new chunk {new_chunk_id} was created"
|
||||
)
|
||||
return make_response(
|
||||
jsonify(
|
||||
{
|
||||
"message": "Chunk updated successfully",
|
||||
"chunk_id": new_chunk_id,
|
||||
"original_chunk_id": chunk_id,
|
||||
}
|
||||
),
|
||||
200,
|
||||
)
|
||||
except Exception as add_error:
|
||||
current_app.logger.error(f"Failed to add updated chunk: {add_error}")
|
||||
return make_response(
|
||||
jsonify({"error": "Failed to update chunk - addition failed"}), 500
|
||||
)
|
||||
except Exception as e:
|
||||
current_app.logger.error(f"Error updating chunk: {e}", exc_info=True)
|
||||
return make_response(jsonify({"success": False}), 500)
|
||||
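Note that the update is implemented as add-new-then-delete-old, so a successful PUT returns a fresh chunk_id and the old one stops being valid; clients should keep the returned ID. A short sketch, continuing the hypothetical client from above:

import requests

resp = requests.put(
    "http://localhost:7091/api/update_chunk",     # assumed base URL
    json={
        "id": "<document ObjectId>",
        "chunk_id": new_chunk_id,                 # from the add_chunk call above
        "text": "Revised paragraph.",
    },
    headers={"Authorization": "Bearer <JWT>"},    # assumed token transport
)
body = resp.json()
new_chunk_id = body["chunk_id"]                   # supersedes body["original_chunk_id"]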
350
application/api/user/sources/routes.py
Normal file
@@ -0,0 +1,350 @@
"""Source document management routes."""

import json
import math
import os

from bson.objectid import ObjectId
from flask import current_app, jsonify, make_response, redirect, request
from flask_restx import fields, Namespace, Resource
from werkzeug.utils import secure_filename

from application.api import api
from application.api.user.base import sources_collection
from application.core.settings import settings
from application.storage.storage_creator import StorageCreator
from application.utils import check_required_fields
from application.vectorstore.vector_creator import VectorCreator


sources_ns = Namespace(
    "sources", description="Source document management operations", path="/api"
)

@sources_ns.route("/sources")
|
||||
class CombinedJson(Resource):
|
||||
@api.doc(description="Provide JSON file with combined available indexes")
|
||||
def get(self):
|
||||
decoded_token = request.decoded_token
|
||||
if not decoded_token:
|
||||
return make_response(jsonify({"success": False}), 401)
|
||||
user = decoded_token.get("sub")
|
||||
data = [
|
||||
{
|
||||
"name": "Default",
|
||||
"date": "default",
|
||||
"model": settings.EMBEDDINGS_NAME,
|
||||
"location": "remote",
|
||||
"tokens": "",
|
||||
"retriever": "classic",
|
||||
}
|
||||
]
|
||||
|
||||
try:
|
||||
for index in sources_collection.find({"user": user}).sort("date", -1):
|
||||
data.append(
|
||||
{
|
||||
"id": str(index["_id"]),
|
||||
"name": index.get("name"),
|
||||
"date": index.get("date"),
|
||||
"model": settings.EMBEDDINGS_NAME,
|
||||
"location": "local",
|
||||
"tokens": index.get("tokens", ""),
|
||||
"retriever": index.get("retriever", "classic"),
|
||||
"syncFrequency": index.get("sync_frequency", ""),
|
||||
"is_nested": bool(index.get("directory_structure")),
|
||||
"type": index.get(
|
||||
"type", "file"
|
||||
), # Add type field with default "file"
|
||||
}
|
||||
)
|
||||
except Exception as err:
|
||||
current_app.logger.error(f"Error retrieving sources: {err}", exc_info=True)
|
||||
return make_response(jsonify({"success": False}), 400)
|
||||
return make_response(jsonify(data), 200)
|
||||
|
||||
|
||||
@sources_ns.route("/sources/paginated")
|
||||
class PaginatedSources(Resource):
|
||||
@api.doc(description="Get document with pagination, sorting and filtering")
|
||||
def get(self):
|
||||
decoded_token = request.decoded_token
|
||||
if not decoded_token:
|
||||
return make_response(jsonify({"success": False}), 401)
|
||||
user = decoded_token.get("sub")
|
||||
sort_field = request.args.get("sort", "date") # Default to 'date'
|
||||
sort_order = request.args.get("order", "desc") # Default to 'desc'
|
||||
page = int(request.args.get("page", 1)) # Default to 1
|
||||
rows_per_page = int(request.args.get("rows", 10)) # Default to 10
|
||||
# add .strip() to remove leading and trailing whitespaces
|
||||
|
||||
search_term = request.args.get(
|
||||
"search", ""
|
||||
).strip() # add search for filter documents
|
||||
|
||||
# Prepare query for filtering
|
||||
|
||||
query = {"user": user}
|
||||
if search_term:
|
||||
query["name"] = {
|
||||
"$regex": search_term,
|
||||
"$options": "i", # using case-insensitive search
|
||||
}
|
||||
total_documents = sources_collection.count_documents(query)
|
||||
total_pages = max(1, math.ceil(total_documents / rows_per_page))
|
||||
page = min(
|
||||
max(1, page), total_pages
|
||||
) # add this to make sure page inbound is within the range
|
||||
sort_order = 1 if sort_order == "asc" else -1
|
||||
skip = (page - 1) * rows_per_page
|
||||
|
||||
try:
|
||||
documents = (
|
||||
sources_collection.find(query)
|
||||
.sort(sort_field, sort_order)
|
||||
.skip(skip)
|
||||
.limit(rows_per_page)
|
||||
)
|
||||
|
||||
paginated_docs = []
|
||||
for doc in documents:
|
||||
doc_data = {
|
||||
"id": str(doc["_id"]),
|
||||
"name": doc.get("name", ""),
|
||||
"date": doc.get("date", ""),
|
||||
"model": settings.EMBEDDINGS_NAME,
|
||||
"location": "local",
|
||||
"tokens": doc.get("tokens", ""),
|
||||
"retriever": doc.get("retriever", "classic"),
|
||||
"syncFrequency": doc.get("sync_frequency", ""),
|
||||
"isNested": bool(doc.get("directory_structure")),
|
||||
"type": doc.get("type", "file"),
|
||||
}
|
||||
paginated_docs.append(doc_data)
|
||||
response = {
|
||||
"total": total_documents,
|
||||
"totalPages": total_pages,
|
||||
"currentPage": page,
|
||||
"paginated": paginated_docs,
|
||||
}
|
||||
return make_response(jsonify(response), 200)
|
||||
except Exception as err:
|
||||
current_app.logger.error(
|
||||
f"Error retrieving paginated sources: {err}", exc_info=True
|
||||
)
|
||||
return make_response(jsonify({"success": False}), 400)
|
||||
|
||||
|
||||
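For reference, a quick client sketch of the pagination contract above (base URL and bearer-token header are assumptions, as in the chunks examples):

import requests

resp = requests.get(
    "http://localhost:7091/api/sources/paginated",    # assumed base URL
    params={"sort": "name", "order": "asc", "page": 1, "rows": 20, "search": "docs"},
    headers={"Authorization": "Bearer <JWT>"},        # assumed token transport
)
body = resp.json()
print(f"page {body['currentPage']} of {body['totalPages']}, {body['total']} documents")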
@sources_ns.route("/docs_check")
|
||||
class CheckDocs(Resource):
|
||||
check_docs_model = api.model(
|
||||
"CheckDocsModel",
|
||||
{"docs": fields.String(required=True, description="Document name")},
|
||||
)
|
||||
|
||||
@api.expect(check_docs_model)
|
||||
@api.doc(description="Check if document exists")
|
||||
def post(self):
|
||||
data = request.get_json()
|
||||
required_fields = ["docs"]
|
||||
missing_fields = check_required_fields(data, required_fields)
|
||||
if missing_fields:
|
||||
return missing_fields
|
||||
try:
|
||||
vectorstore = "vectors/" + secure_filename(data["docs"])
|
||||
if os.path.exists(vectorstore) or data["docs"] == "default":
|
||||
return {"status": "exists"}, 200
|
||||
except Exception as err:
|
||||
current_app.logger.error(f"Error checking document: {err}", exc_info=True)
|
||||
return make_response(jsonify({"success": False}), 400)
|
||||
return make_response(jsonify({"status": "not found"}), 404)
|
||||
|
||||
|
||||
@sources_ns.route("/delete_by_ids")
|
||||
class DeleteByIds(Resource):
|
||||
@api.doc(
|
||||
description="Deletes documents from the vector store by IDs",
|
||||
params={"path": "Comma-separated list of IDs"},
|
||||
)
|
||||
def get(self):
|
||||
ids = request.args.get("path")
|
||||
if not ids:
|
||||
return make_response(
|
||||
jsonify({"success": False, "message": "Missing required fields"}), 400
|
||||
)
|
||||
try:
|
||||
result = sources_collection.delete_index(ids=ids)
|
||||
if result:
|
||||
return make_response(jsonify({"success": True}), 200)
|
||||
except Exception as err:
|
||||
current_app.logger.error(f"Error deleting indexes: {err}", exc_info=True)
|
||||
return make_response(jsonify({"success": False}), 400)
|
||||
return make_response(jsonify({"success": False}), 400)
|
||||
|
||||
|
||||
@sources_ns.route("/delete_old")
|
||||
class DeleteOldIndexes(Resource):
|
||||
@api.doc(
|
||||
description="Deletes old indexes and associated files",
|
||||
params={"source_id": "The source ID to delete"},
|
||||
)
|
||||
def get(self):
|
||||
decoded_token = request.decoded_token
|
||||
if not decoded_token:
|
||||
return make_response(jsonify({"success": False}), 401)
|
||||
source_id = request.args.get("source_id")
|
||||
if not source_id:
|
||||
return make_response(
|
||||
jsonify({"success": False, "message": "Missing required fields"}), 400
|
||||
)
|
||||
doc = sources_collection.find_one(
|
||||
{"_id": ObjectId(source_id), "user": decoded_token.get("sub")}
|
||||
)
|
||||
if not doc:
|
||||
return make_response(jsonify({"status": "not found"}), 404)
|
||||
storage = StorageCreator.get_storage()
|
||||
|
||||
try:
|
||||
# Delete vector index
|
||||
|
||||
if settings.VECTOR_STORE == "faiss":
|
||||
index_path = f"indexes/{str(doc['_id'])}"
|
||||
if storage.file_exists(f"{index_path}/index.faiss"):
|
||||
storage.delete_file(f"{index_path}/index.faiss")
|
||||
if storage.file_exists(f"{index_path}/index.pkl"):
|
||||
storage.delete_file(f"{index_path}/index.pkl")
|
||||
else:
|
||||
vectorstore = VectorCreator.create_vectorstore(
|
||||
settings.VECTOR_STORE, source_id=str(doc["_id"])
|
||||
)
|
||||
vectorstore.delete_index()
|
||||
if "file_path" in doc and doc["file_path"]:
|
||||
file_path = doc["file_path"]
|
||||
if storage.is_directory(file_path):
|
||||
files = storage.list_files(file_path)
|
||||
for f in files:
|
||||
storage.delete_file(f)
|
||||
else:
|
||||
storage.delete_file(file_path)
|
||||
except FileNotFoundError:
|
||||
pass
|
||||
except Exception as err:
|
||||
current_app.logger.error(
|
||||
f"Error deleting files and indexes: {err}", exc_info=True
|
||||
)
|
||||
return make_response(jsonify({"success": False}), 400)
|
||||
sources_collection.delete_one({"_id": ObjectId(source_id)})
|
||||
return make_response(jsonify({"success": True}), 200)
|
||||
|
||||
|
||||
@sources_ns.route("/combine")
|
||||
class RedirectToSources(Resource):
|
||||
@api.doc(
|
||||
description="Redirects /api/combine to /api/sources for backward compatibility"
|
||||
)
|
||||
def get(self):
|
||||
return redirect("/api/sources", code=301)
|
||||
|
||||
|
||||
@sources_ns.route("/manage_sync")
|
||||
class ManageSync(Resource):
|
||||
manage_sync_model = api.model(
|
||||
"ManageSyncModel",
|
||||
{
|
||||
"source_id": fields.String(required=True, description="Source ID"),
|
||||
"sync_frequency": fields.String(
|
||||
required=True,
|
||||
description="Sync frequency (never, daily, weekly, monthly)",
|
||||
),
|
||||
},
|
||||
)
|
||||
|
||||
@api.expect(manage_sync_model)
|
||||
@api.doc(description="Manage sync frequency for sources")
|
||||
def post(self):
|
||||
decoded_token = request.decoded_token
|
||||
if not decoded_token:
|
||||
return make_response(jsonify({"success": False}), 401)
|
||||
user = decoded_token.get("sub")
|
||||
data = request.get_json()
|
||||
required_fields = ["source_id", "sync_frequency"]
|
||||
missing_fields = check_required_fields(data, required_fields)
|
||||
if missing_fields:
|
||||
return missing_fields
|
||||
source_id = data["source_id"]
|
||||
sync_frequency = data["sync_frequency"]
|
||||
|
||||
if sync_frequency not in ["never", "daily", "weekly", "monthly"]:
|
||||
return make_response(
|
||||
jsonify({"success": False, "message": "Invalid frequency"}), 400
|
||||
)
|
||||
update_data = {"$set": {"sync_frequency": sync_frequency}}
|
||||
try:
|
||||
sources_collection.update_one(
|
||||
{
|
||||
"_id": ObjectId(source_id),
|
||||
"user": user,
|
||||
},
|
||||
update_data,
|
||||
)
|
||||
except Exception as err:
|
||||
current_app.logger.error(
|
||||
f"Error updating sync frequency: {err}", exc_info=True
|
||||
)
|
||||
return make_response(jsonify({"success": False}), 400)
|
||||
return make_response(jsonify({"success": True}), 200)
|
||||
|
||||
|
||||
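A minimal sketch of the /api/manage_sync call; unlike the upload endpoints below it takes a JSON body, not form data (base URL and auth header assumed, as above):

import requests

resp = requests.post(
    "http://localhost:7091/api/manage_sync",          # assumed base URL
    json={"source_id": "<source ObjectId>", "sync_frequency": "weekly"},
    headers={"Authorization": "Bearer <JWT>"},        # assumed token transport
)
assert resp.status_code == 200 and resp.json()["success"]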
@sources_ns.route("/directory_structure")
|
||||
class DirectoryStructure(Resource):
|
||||
@api.doc(
|
||||
description="Get the directory structure for a document",
|
||||
params={"id": "The document ID"},
|
||||
)
|
||||
def get(self):
|
||||
decoded_token = request.decoded_token
|
||||
if not decoded_token:
|
||||
return make_response(jsonify({"success": False}), 401)
|
||||
user = decoded_token.get("sub")
|
||||
doc_id = request.args.get("id")
|
||||
|
||||
if not doc_id:
|
||||
return make_response(jsonify({"error": "Document ID is required"}), 400)
|
||||
if not ObjectId.is_valid(doc_id):
|
||||
return make_response(jsonify({"error": "Invalid document ID"}), 400)
|
||||
try:
|
||||
doc = sources_collection.find_one({"_id": ObjectId(doc_id), "user": user})
|
||||
if not doc:
|
||||
return make_response(
|
||||
jsonify({"error": "Document not found or access denied"}), 404
|
||||
)
|
||||
directory_structure = doc.get("directory_structure", {})
|
||||
base_path = doc.get("file_path", "")
|
||||
|
||||
provider = None
|
||||
remote_data = doc.get("remote_data")
|
||||
try:
|
||||
if isinstance(remote_data, str) and remote_data:
|
||||
remote_data_obj = json.loads(remote_data)
|
||||
provider = remote_data_obj.get("provider")
|
||||
except Exception as e:
|
||||
current_app.logger.warning(
|
||||
f"Failed to parse remote_data for doc {doc_id}: {e}"
|
||||
)
|
||||
return make_response(
|
||||
jsonify(
|
||||
{
|
||||
"success": True,
|
||||
"directory_structure": directory_structure,
|
||||
"base_path": base_path,
|
||||
"provider": provider,
|
||||
}
|
||||
),
|
||||
200,
|
||||
)
|
||||
except Exception as e:
|
||||
current_app.logger.error(
|
||||
f"Error retrieving directory structure: {e}", exc_info=True
|
||||
)
|
||||
return make_response(jsonify({"success": False, "error": str(e)}), 500)
|
||||
570
application/api/user/sources/upload.py
Normal file
@@ -0,0 +1,570 @@
"""Source document management upload functionality."""

import json
import os
import tempfile
import zipfile

from bson.objectid import ObjectId
from flask import current_app, jsonify, make_response, request
from flask_restx import fields, Namespace, Resource

from application.api import api
from application.api.user.base import sources_collection
from application.api.user.tasks import ingest, ingest_connector_task, ingest_remote
from application.core.settings import settings
from application.parser.connectors.connector_creator import ConnectorCreator
from application.storage.storage_creator import StorageCreator
from application.utils import check_required_fields, safe_filename


sources_upload_ns = Namespace(
    "sources", description="Source document management operations", path="/api"
)

@sources_upload_ns.route("/upload")
|
||||
class UploadFile(Resource):
|
||||
@api.expect(
|
||||
api.model(
|
||||
"UploadModel",
|
||||
{
|
||||
"user": fields.String(required=True, description="User ID"),
|
||||
"name": fields.String(required=True, description="Job name"),
|
||||
"file": fields.Raw(required=True, description="File(s) to upload"),
|
||||
},
|
||||
)
|
||||
)
|
||||
@api.doc(
|
||||
description="Uploads a file to be vectorized and indexed",
|
||||
)
|
||||
def post(self):
|
||||
decoded_token = request.decoded_token
|
||||
if not decoded_token:
|
||||
return make_response(jsonify({"success": False}), 401)
|
||||
data = request.form
|
||||
files = request.files.getlist("file")
|
||||
required_fields = ["user", "name"]
|
||||
missing_fields = check_required_fields(data, required_fields)
|
||||
if missing_fields or not files or all(file.filename == "" for file in files):
|
||||
return make_response(
|
||||
jsonify(
|
||||
{
|
||||
"status": "error",
|
||||
"message": "Missing required fields or files",
|
||||
}
|
||||
),
|
||||
400,
|
||||
)
|
||||
user = decoded_token.get("sub")
|
||||
job_name = request.form["name"]
|
||||
|
||||
# Create safe versions for filesystem operations
|
||||
|
||||
safe_user = safe_filename(user)
|
||||
dir_name = safe_filename(job_name)
|
||||
base_path = f"{settings.UPLOAD_FOLDER}/{safe_user}/{dir_name}"
|
||||
|
||||
try:
|
||||
storage = StorageCreator.get_storage()
|
||||
|
||||
for file in files:
|
||||
original_filename = file.filename
|
||||
safe_file = safe_filename(original_filename)
|
||||
|
||||
with tempfile.TemporaryDirectory() as temp_dir:
|
||||
temp_file_path = os.path.join(temp_dir, safe_file)
|
||||
file.save(temp_file_path)
|
||||
|
||||
if zipfile.is_zipfile(temp_file_path):
|
||||
try:
|
||||
with zipfile.ZipFile(temp_file_path, "r") as zip_ref:
|
||||
zip_ref.extractall(path=temp_dir)
|
||||
|
||||
# Walk through extracted files and upload them
|
||||
|
||||
for root, _, files in os.walk(temp_dir):
|
||||
for extracted_file in files:
|
||||
if (
|
||||
os.path.join(root, extracted_file)
|
||||
== temp_file_path
|
||||
):
|
||||
continue
|
||||
rel_path = os.path.relpath(
|
||||
os.path.join(root, extracted_file), temp_dir
|
||||
)
|
||||
storage_path = f"{base_path}/{rel_path}"
|
||||
|
||||
with open(
|
||||
os.path.join(root, extracted_file), "rb"
|
||||
) as f:
|
||||
storage.save_file(f, storage_path)
|
||||
except Exception as e:
|
||||
current_app.logger.error(
|
||||
f"Error extracting zip: {e}", exc_info=True
|
||||
)
|
||||
# If zip extraction fails, save the original zip file
|
||||
|
||||
file_path = f"{base_path}/{safe_file}"
|
||||
with open(temp_file_path, "rb") as f:
|
||||
storage.save_file(f, file_path)
|
||||
else:
|
||||
# For non-zip files, save directly
|
||||
|
||||
file_path = f"{base_path}/{safe_file}"
|
||||
with open(temp_file_path, "rb") as f:
|
||||
storage.save_file(f, file_path)
|
||||
task = ingest.delay(
|
||||
settings.UPLOAD_FOLDER,
|
||||
[
|
||||
".rst",
|
||||
".md",
|
||||
".pdf",
|
||||
".txt",
|
||||
".docx",
|
||||
".csv",
|
||||
".epub",
|
||||
".html",
|
||||
".mdx",
|
||||
".json",
|
||||
".xlsx",
|
||||
".pptx",
|
||||
".png",
|
||||
".jpg",
|
||||
".jpeg",
|
||||
],
|
||||
job_name,
|
||||
user,
|
||||
file_path=base_path,
|
||||
filename=dir_name,
|
||||
)
|
||||
except Exception as err:
|
||||
current_app.logger.error(f"Error uploading file: {err}", exc_info=True)
|
||||
return make_response(jsonify({"success": False}), 400)
|
||||
return make_response(jsonify({"success": True, "task_id": task.id}), 200)
|
||||
|
||||
|
||||
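For reference, a client sketch of the multipart contract above; the returned task_id feeds /api/task_status further down (base URL and auth header are assumptions):

import requests

with open("manual.pdf", "rb") as fh:
    resp = requests.post(
        "http://localhost:7091/api/upload",           # assumed base URL
        data={"user": "local", "name": "product-manual"},
        files={"file": ("manual.pdf", fh)},
        headers={"Authorization": "Bearer <JWT>"},    # assumed token transport
    )
task_id = resp.json()["task_id"]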
@sources_upload_ns.route("/remote")
|
||||
class UploadRemote(Resource):
|
||||
@api.expect(
|
||||
api.model(
|
||||
"RemoteUploadModel",
|
||||
{
|
||||
"user": fields.String(required=True, description="User ID"),
|
||||
"source": fields.String(
|
||||
required=True, description="Source of the data"
|
||||
),
|
||||
"name": fields.String(required=True, description="Job name"),
|
||||
"data": fields.String(required=True, description="Data to process"),
|
||||
"repo_url": fields.String(description="GitHub repository URL"),
|
||||
},
|
||||
)
|
||||
)
|
||||
@api.doc(
|
||||
description="Uploads remote source for vectorization",
|
||||
)
|
||||
def post(self):
|
||||
decoded_token = request.decoded_token
|
||||
if not decoded_token:
|
||||
return make_response(jsonify({"success": False}), 401)
|
||||
data = request.form
|
||||
required_fields = ["user", "source", "name", "data"]
|
||||
missing_fields = check_required_fields(data, required_fields)
|
||||
if missing_fields:
|
||||
return missing_fields
|
||||
try:
|
||||
config = json.loads(data["data"])
|
||||
source_data = None
|
||||
|
||||
if data["source"] == "github":
|
||||
source_data = config.get("repo_url")
|
||||
elif data["source"] in ["crawler", "url"]:
|
||||
source_data = config.get("url")
|
||||
elif data["source"] == "reddit":
|
||||
source_data = config
|
||||
elif data["source"] in ConnectorCreator.get_supported_connectors():
|
||||
session_token = config.get("session_token")
|
||||
if not session_token:
|
||||
return make_response(
|
||||
jsonify(
|
||||
{
|
||||
"success": False,
|
||||
"error": f"Missing session_token in {data['source']} configuration",
|
||||
}
|
||||
),
|
||||
400,
|
||||
)
|
||||
# Process file_ids
|
||||
|
||||
file_ids = config.get("file_ids", [])
|
||||
if isinstance(file_ids, str):
|
||||
file_ids = [id.strip() for id in file_ids.split(",") if id.strip()]
|
||||
elif not isinstance(file_ids, list):
|
||||
file_ids = []
|
||||
# Process folder_ids
|
||||
|
||||
folder_ids = config.get("folder_ids", [])
|
||||
if isinstance(folder_ids, str):
|
||||
folder_ids = [
|
||||
id.strip() for id in folder_ids.split(",") if id.strip()
|
||||
]
|
||||
elif not isinstance(folder_ids, list):
|
||||
folder_ids = []
|
||||
config["file_ids"] = file_ids
|
||||
config["folder_ids"] = folder_ids
|
||||
|
||||
task = ingest_connector_task.delay(
|
||||
job_name=data["name"],
|
||||
user=decoded_token.get("sub"),
|
||||
source_type=data["source"],
|
||||
session_token=session_token,
|
||||
file_ids=file_ids,
|
||||
folder_ids=folder_ids,
|
||||
recursive=config.get("recursive", False),
|
||||
retriever=config.get("retriever", "classic"),
|
||||
)
|
||||
return make_response(
|
||||
jsonify({"success": True, "task_id": task.id}), 200
|
||||
)
|
||||
task = ingest_remote.delay(
|
||||
source_data=source_data,
|
||||
job_name=data["name"],
|
||||
user=decoded_token.get("sub"),
|
||||
loader=data["source"],
|
||||
)
|
||||
except Exception as err:
|
||||
current_app.logger.error(
|
||||
f"Error uploading remote source: {err}", exc_info=True
|
||||
)
|
||||
return make_response(jsonify({"success": False}), 400)
|
||||
return make_response(jsonify({"success": True, "task_id": task.id}), 200)
|
||||
|
||||
|
||||
@sources_upload_ns.route("/manage_source_files")
|
||||
class ManageSourceFiles(Resource):
|
||||
@api.expect(
|
||||
api.model(
|
||||
"ManageSourceFilesModel",
|
||||
{
|
||||
"source_id": fields.String(
|
||||
required=True, description="Source ID to modify"
|
||||
),
|
||||
"operation": fields.String(
|
||||
required=True,
|
||||
description="Operation: 'add', 'remove', or 'remove_directory'",
|
||||
),
|
||||
"file_paths": fields.List(
|
||||
fields.String,
|
||||
required=False,
|
||||
description="File paths to remove (for remove operation)",
|
||||
),
|
||||
"directory_path": fields.String(
|
||||
required=False,
|
||||
description="Directory path to remove (for remove_directory operation)",
|
||||
),
|
||||
"file": fields.Raw(
|
||||
required=False, description="Files to add (for add operation)"
|
||||
),
|
||||
"parent_dir": fields.String(
|
||||
required=False,
|
||||
description="Parent directory path relative to source root",
|
||||
),
|
||||
},
|
||||
)
|
||||
)
|
||||
@api.doc(
|
||||
description="Add files, remove files, or remove directories from an existing source",
|
||||
)
|
||||
def post(self):
|
||||
decoded_token = request.decoded_token
|
||||
if not decoded_token:
|
||||
return make_response(
|
||||
jsonify({"success": False, "message": "Unauthorized"}), 401
|
||||
)
|
||||
user = decoded_token.get("sub")
|
||||
source_id = request.form.get("source_id")
|
||||
operation = request.form.get("operation")
|
||||
|
||||
if not source_id or not operation:
|
||||
return make_response(
|
||||
jsonify(
|
||||
{
|
||||
"success": False,
|
||||
"message": "source_id and operation are required",
|
||||
}
|
||||
),
|
||||
400,
|
||||
)
|
||||
if operation not in ["add", "remove", "remove_directory"]:
|
||||
return make_response(
|
||||
jsonify(
|
||||
{
|
||||
"success": False,
|
||||
"message": "operation must be 'add', 'remove', or 'remove_directory'",
|
||||
}
|
||||
),
|
||||
400,
|
||||
)
|
||||
try:
|
||||
ObjectId(source_id)
|
||||
except Exception:
|
||||
return make_response(
|
||||
jsonify({"success": False, "message": "Invalid source ID format"}), 400
|
||||
)
|
||||
try:
|
||||
source = sources_collection.find_one(
|
||||
{"_id": ObjectId(source_id), "user": user}
|
||||
)
|
||||
if not source:
|
||||
return make_response(
|
||||
jsonify(
|
||||
{
|
||||
"success": False,
|
||||
"message": "Source not found or access denied",
|
||||
}
|
||||
),
|
||||
404,
|
||||
)
|
||||
except Exception as err:
|
||||
current_app.logger.error(f"Error finding source: {err}", exc_info=True)
|
||||
return make_response(
|
||||
jsonify({"success": False, "message": "Database error"}), 500
|
||||
)
|
||||
try:
|
||||
storage = StorageCreator.get_storage()
|
||||
source_file_path = source.get("file_path", "")
|
||||
parent_dir = request.form.get("parent_dir", "")
|
||||
|
||||
if parent_dir and (parent_dir.startswith("/") or ".." in parent_dir):
|
||||
return make_response(
|
||||
jsonify(
|
||||
{"success": False, "message": "Invalid parent directory path"}
|
||||
),
|
||||
400,
|
||||
)
|
||||
if operation == "add":
|
||||
files = request.files.getlist("file")
|
||||
if not files or all(file.filename == "" for file in files):
|
||||
return make_response(
|
||||
jsonify(
|
||||
{
|
||||
"success": False,
|
||||
"message": "No files provided for add operation",
|
||||
}
|
||||
),
|
||||
400,
|
||||
)
|
||||
added_files = []
|
||||
|
||||
target_dir = source_file_path
|
||||
if parent_dir:
|
||||
target_dir = f"{source_file_path}/{parent_dir}"
|
||||
for file in files:
|
||||
if file.filename:
|
||||
safe_filename_str = safe_filename(file.filename)
|
||||
file_path = f"{target_dir}/{safe_filename_str}"
|
||||
|
||||
# Save file to storage
|
||||
|
||||
storage.save_file(file, file_path)
|
||||
added_files.append(safe_filename_str)
|
||||
# Trigger re-ingestion pipeline
|
||||
|
||||
from application.api.user.tasks import reingest_source_task
|
||||
|
||||
task = reingest_source_task.delay(source_id=source_id, user=user)
|
||||
|
||||
return make_response(
|
||||
jsonify(
|
||||
{
|
||||
"success": True,
|
||||
"message": f"Added {len(added_files)} files",
|
||||
"added_files": added_files,
|
||||
"parent_dir": parent_dir,
|
||||
"reingest_task_id": task.id,
|
||||
}
|
||||
),
|
||||
200,
|
||||
)
|
||||
elif operation == "remove":
|
||||
file_paths_str = request.form.get("file_paths")
|
||||
if not file_paths_str:
|
||||
return make_response(
|
||||
jsonify(
|
||||
{
|
||||
"success": False,
|
||||
"message": "file_paths required for remove operation",
|
||||
}
|
||||
),
|
||||
400,
|
||||
)
|
||||
try:
|
||||
file_paths = (
|
||||
json.loads(file_paths_str)
|
||||
if isinstance(file_paths_str, str)
|
||||
else file_paths_str
|
||||
)
|
||||
except Exception:
|
||||
return make_response(
|
||||
jsonify(
|
||||
{"success": False, "message": "Invalid file_paths format"}
|
||||
),
|
||||
400,
|
||||
)
|
||||
# Remove files from storage and directory structure
|
||||
|
||||
removed_files = []
|
||||
for file_path in file_paths:
|
||||
full_path = f"{source_file_path}/{file_path}"
|
||||
|
||||
# Remove from storage
|
||||
|
||||
if storage.file_exists(full_path):
|
||||
storage.delete_file(full_path)
|
||||
removed_files.append(file_path)
|
||||
# Trigger re-ingestion pipeline
|
||||
|
||||
from application.api.user.tasks import reingest_source_task
|
||||
|
||||
task = reingest_source_task.delay(source_id=source_id, user=user)
|
||||
|
||||
return make_response(
|
||||
jsonify(
|
||||
{
|
||||
"success": True,
|
||||
"message": f"Removed {len(removed_files)} files",
|
||||
"removed_files": removed_files,
|
||||
"reingest_task_id": task.id,
|
||||
}
|
||||
),
|
||||
200,
|
||||
)
|
||||
elif operation == "remove_directory":
|
||||
directory_path = request.form.get("directory_path")
|
||||
if not directory_path:
|
||||
return make_response(
|
||||
jsonify(
|
||||
{
|
||||
"success": False,
|
||||
"message": "directory_path required for remove_directory operation",
|
||||
}
|
||||
),
|
||||
400,
|
||||
)
|
||||
# Validate directory path (prevent path traversal)
|
||||
|
||||
if directory_path.startswith("/") or ".." in directory_path:
|
||||
current_app.logger.warning(
|
||||
f"Invalid directory path attempted for removal. "
|
||||
f"User: {user}, Source ID: {source_id}, Directory path: {directory_path}"
|
||||
)
|
||||
return make_response(
|
||||
jsonify(
|
||||
{"success": False, "message": "Invalid directory path"}
|
||||
),
|
||||
400,
|
||||
)
|
||||
full_directory_path = (
|
||||
f"{source_file_path}/{directory_path}"
|
||||
if directory_path
|
||||
else source_file_path
|
||||
)
|
||||
|
||||
if not storage.is_directory(full_directory_path):
|
||||
current_app.logger.warning(
|
||||
f"Directory not found or is not a directory for removal. "
|
||||
f"User: {user}, Source ID: {source_id}, Directory path: {directory_path}, "
|
||||
f"Full path: {full_directory_path}"
|
||||
)
|
||||
return make_response(
|
||||
jsonify(
|
||||
{
|
||||
"success": False,
|
||||
"message": "Directory not found or is not a directory",
|
||||
}
|
||||
),
|
||||
404,
|
||||
)
|
||||
success = storage.remove_directory(full_directory_path)
|
||||
|
||||
if not success:
|
||||
current_app.logger.error(
|
||||
f"Failed to remove directory from storage. "
|
||||
f"User: {user}, Source ID: {source_id}, Directory path: {directory_path}, "
|
||||
f"Full path: {full_directory_path}"
|
||||
)
|
||||
return make_response(
|
||||
jsonify(
|
||||
{"success": False, "message": "Failed to remove directory"}
|
||||
),
|
||||
500,
|
||||
)
|
||||
current_app.logger.info(
|
||||
f"Successfully removed directory. "
|
||||
f"User: {user}, Source ID: {source_id}, Directory path: {directory_path}, "
|
||||
f"Full path: {full_directory_path}"
|
||||
)
|
||||
|
||||
# Trigger re-ingestion pipeline
|
||||
|
||||
from application.api.user.tasks import reingest_source_task
|
||||
|
||||
task = reingest_source_task.delay(source_id=source_id, user=user)
|
||||
|
||||
return make_response(
|
||||
jsonify(
|
||||
{
|
||||
"success": True,
|
||||
"message": f"Successfully removed directory: {directory_path}",
|
||||
"removed_directory": directory_path,
|
||||
"reingest_task_id": task.id,
|
||||
}
|
||||
),
|
||||
200,
|
||||
)
|
||||
except Exception as err:
|
||||
error_context = f"operation={operation}, user={user}, source_id={source_id}"
|
||||
if operation == "remove_directory":
|
||||
directory_path = request.form.get("directory_path", "")
|
||||
error_context += f", directory_path={directory_path}"
|
||||
elif operation == "remove":
|
||||
file_paths_str = request.form.get("file_paths", "")
|
||||
error_context += f", file_paths={file_paths_str}"
|
||||
elif operation == "add":
|
||||
parent_dir = request.form.get("parent_dir", "")
|
||||
error_context += f", parent_dir={parent_dir}"
|
||||
current_app.logger.error(
|
||||
f"Error managing source files: {err} ({error_context})", exc_info=True
|
||||
)
|
||||
return make_response(
|
||||
jsonify({"success": False, "message": "Operation failed"}), 500
|
||||
)
|
||||
|
||||
|
||||
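Worth noting for clients of the endpoint above: file_paths travels as a JSON-encoded string inside the multipart/form fields, not as a JSON body. A minimal sketch (base URL and auth header assumed):

import json
import requests

resp = requests.post(
    "http://localhost:7091/api/manage_source_files",  # assumed base URL
    data={
        "source_id": "<source ObjectId>",
        "operation": "remove",
        # JSON-encoded list, paths relative to the source root
        "file_paths": json.dumps(["docs/old.md", "docs/stale.md"]),
    },
    headers={"Authorization": "Bearer <JWT>"},        # assumed token transport
)
body = resp.json()
print(body["removed_files"], "-> reingest task:", body["reingest_task_id"])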
@sources_upload_ns.route("/task_status")
|
||||
class TaskStatus(Resource):
|
||||
task_status_model = api.model(
|
||||
"TaskStatusModel",
|
||||
{"task_id": fields.String(required=True, description="Task ID")},
|
||||
)
|
||||
|
||||
@api.expect(task_status_model)
|
||||
@api.doc(description="Get celery job status")
|
||||
def get(self):
|
||||
task_id = request.args.get("task_id")
|
||||
if not task_id:
|
||||
return make_response(
|
||||
jsonify({"success": False, "message": "Task ID is required"}), 400
|
||||
)
|
||||
try:
|
||||
from application.celery_init import celery
|
||||
|
||||
task = celery.AsyncResult(task_id)
|
||||
task_meta = task.info
|
||||
print(f"Task status: {task.status}")
|
||||
if not isinstance(
|
||||
task_meta, (dict, list, str, int, float, bool, type(None))
|
||||
):
|
||||
task_meta = str(task_meta) # Convert to a string representation
|
||||
except Exception as err:
|
||||
current_app.logger.error(f"Error getting task status: {err}", exc_info=True)
|
||||
return make_response(jsonify({"success": False}), 400)
|
||||
return make_response(jsonify({"status": task.status, "result": task_meta}), 200)
|
||||
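And a simple polling sketch against /api/task_status, consuming the task_id returned by /api/upload or /api/remote (endpoint URL and auth header are assumptions, as in the earlier examples):

import time

import requests

def wait_for_task(task_id, base_url="http://localhost:7091", token="<JWT>"):
    # Poll until Celery reports a terminal state.
    while True:
        resp = requests.get(
            f"{base_url}/api/task_status",
            params={"task_id": task_id},
            headers={"Authorization": f"Bearer {token}"},
        )
        body = resp.json()
        if body["status"] in ("SUCCESS", "FAILURE", "REVOKED"):
            return body
        time.sleep(2)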