Merge pull request #1873 from ManishMadan2882/main

Sources are the new Docs
Authored by Alex on 2025-08-13 18:24:35 +01:00, committed by GitHub
40 changed files with 3412 additions and 1119 deletions

View File

@@ -1,5 +1,6 @@
import os
import datetime
import json
from flask import Blueprint, request, send_from_directory
from werkzeug.utils import secure_filename
from bson.objectid import ObjectId
@@ -48,7 +49,17 @@ def upload_index_files():
remote_data = request.form["remote_data"] if "remote_data" in request.form else None
sync_frequency = request.form["sync_frequency"] if "sync_frequency" in request.form else None
original_file_path = request.form.get("original_file_path")
file_path = request.form.get("file_path")
directory_structure = request.form.get("directory_structure")
if directory_structure:
try:
directory_structure = json.loads(directory_structure)
except Exception:
logger.error("Error parsing directory_structure")
directory_structure = {}
else:
directory_structure = {}
storage = StorageCreator.get_storage()
index_base_path = f"indexes/{id}"
@@ -66,10 +77,13 @@ def upload_index_files():
file_pkl = request.files["file_pkl"]
if file_pkl.filename == "":
return {"status": "no file name"}
# Save index files to storage
- storage.save_file(file_faiss, f"{index_base_path}/index.faiss")
- storage.save_file(file_pkl, f"{index_base_path}/index.pkl")
+ faiss_storage_path = f"{index_base_path}/index.faiss"
+ pkl_storage_path = f"{index_base_path}/index.pkl"
+ storage.save_file(file_faiss, faiss_storage_path)
+ storage.save_file(file_pkl, pkl_storage_path)
existing_entry = sources_collection.find_one({"_id": ObjectId(id)})
if existing_entry:
@@ -87,7 +101,8 @@ def upload_index_files():
"retriever": retriever,
"remote_data": remote_data,
"sync_frequency": sync_frequency,
"file_path": original_file_path,
"file_path": file_path,
"directory_structure": directory_structure,
}
},
)
@@ -105,7 +120,8 @@ def upload_index_files():
"retriever": retriever,
"remote_data": remote_data,
"sync_frequency": sync_frequency,
"file_path": original_file_path,
"file_path": file_path,
"directory_structure": directory_structure,
}
)
return {"status": "ok"}

View File

@@ -3,11 +3,11 @@ import json
import math
import os
import secrets
import shutil
import uuid
from functools import wraps
from typing import Optional, Tuple
import tempfile
import zipfile
from bson.binary import Binary, UuidRepresentation
from bson.dbref import DBRef
from bson.objectid import ObjectId
@@ -44,6 +44,7 @@ from application.utils import (
validate_function_name,
validate_required_fields,
)
from application.utils import num_tokens_from_string
from application.vectorstore.vector_creator import VectorCreator
storage = StorageCreator.get_storage()
@@ -474,7 +475,7 @@ class DeleteByIds(Resource):
@user_ns.route("/api/delete_old")
class DeleteOldIndexes(Resource):
@api.doc(
description="Deletes old indexes",
description="Deletes old indexes and associated files",
params={"source_id": "The source ID to delete"},
)
def get(self):
@@ -491,21 +492,40 @@ class DeleteOldIndexes(Resource):
)
if not doc:
return make_response(jsonify({"status": "not found"}), 404)
storage = StorageCreator.get_storage()
try:
# Delete vector index
if settings.VECTOR_STORE == "faiss":
- shutil.rmtree(os.path.join(current_dir, "indexes", str(doc["_id"])))
+ index_path = f"indexes/{str(doc['_id'])}"
+ if storage.file_exists(f"{index_path}/index.faiss"):
+     storage.delete_file(f"{index_path}/index.faiss")
+ if storage.file_exists(f"{index_path}/index.pkl"):
+     storage.delete_file(f"{index_path}/index.pkl")
else:
vectorstore = VectorCreator.create_vectorstore(
settings.VECTOR_STORE, source_id=str(doc["_id"])
)
vectorstore.delete_index()
if "file_path" in doc and doc["file_path"]:
file_path = doc["file_path"]
if storage.is_directory(file_path):
files = storage.list_files(file_path)
for f in files:
storage.delete_file(f)
else:
storage.delete_file(file_path)
except FileNotFoundError:
pass
except Exception as err:
current_app.logger.error(
f"Error deleting old indexes: {err}", exc_info=True
f"Error deleting files and indexes: {err}", exc_info=True
)
return make_response(jsonify({"success": False}), 400)
sources_collection.delete_one({"_id": ObjectId(source_id)})
return make_response(jsonify({"success": True}), 200)
@@ -549,146 +569,276 @@ class UploadFile(Resource):
# Create safe versions for filesystem operations
safe_user = safe_filename(user)
dir_name = safe_filename(job_name)
base_path = f"{settings.UPLOAD_FOLDER}/{safe_user}/{dir_name}"
try:
storage = StorageCreator.get_storage()
base_path = f"{settings.UPLOAD_FOLDER}/{safe_user}/{dir_name}"
- if len(files) > 1:
-     temp_files = []
-     for file in files:
-         filename = safe_filename(file.filename)
-         temp_path = f"{base_path}/temp/{filename}"
-         storage.save_file(file, temp_path)
-         temp_files.append(temp_path)
-         print(f"Saved file: {filename}")
-     zip_filename = f"{dir_name}.zip"
-     zip_path = f"{base_path}/{zip_filename}"
-     zip_temp_path = None
-     def create_zip_archive(temp_paths, dir_name, storage):
-         import tempfile
-         with tempfile.NamedTemporaryFile(
-             delete=False, suffix=".zip"
-         ) as temp_zip_file:
-             zip_output_path = temp_zip_file.name
-         with tempfile.TemporaryDirectory() as stage_dir:
-             for path in temp_paths:
-                 try:
-                     file_data = storage.get_file(path)
-                     with open(
-                         os.path.join(stage_dir, os.path.basename(path)),
-                         "wb",
-                     ) as f:
-                         f.write(file_data.read())
-                 except Exception as e:
-                     current_app.logger.error(
-                         f"Error processing file {path} for zipping: {e}",
-                         exc_info=True,
-                     )
-                     if os.path.exists(zip_output_path):
-                         os.remove(zip_output_path)
-                     raise
+ for file in files:
+     original_filename = file.filename
+     safe_file = safe_filename(original_filename)
+     with tempfile.TemporaryDirectory() as temp_dir:
+         temp_file_path = os.path.join(temp_dir, safe_file)
+         file.save(temp_file_path)
+         if zipfile.is_zipfile(temp_file_path):
try:
-             shutil.make_archive(
-                 base_name=zip_output_path.replace(".zip", ""),
-                 format="zip",
-                 root_dir=stage_dir,
-             )
+             with zipfile.ZipFile(temp_file_path, 'r') as zip_ref:
+                 zip_ref.extractall(path=temp_dir)
+             # Walk through extracted files and upload them
+             for root, _, files in os.walk(temp_dir):
+                 for extracted_file in files:
+                     if os.path.join(root, extracted_file) == temp_file_path:
+                         continue
+                     rel_path = os.path.relpath(os.path.join(root, extracted_file), temp_dir)
+                     storage_path = f"{base_path}/{rel_path}"
+                     with open(os.path.join(root, extracted_file), 'rb') as f:
+                         storage.save_file(f, storage_path)
except Exception as e:
-             current_app.logger.error(
-                 f"Error creating zip archive: {e}", exc_info=True
-             )
-             if os.path.exists(zip_output_path):
-                 os.remove(zip_output_path)
-             raise
-         return zip_output_path
-     try:
-         zip_temp_path = create_zip_archive(temp_files, dir_name, storage)
-         with open(zip_temp_path, "rb") as zip_file:
-             storage.save_file(zip_file, zip_path)
-         task = ingest.delay(
-             settings.UPLOAD_FOLDER,
-             [
-                 ".rst",
-                 ".md",
-                 ".pdf",
-                 ".txt",
-                 ".docx",
-                 ".csv",
-                 ".epub",
-                 ".html",
-                 ".mdx",
-                 ".json",
-                 ".xlsx",
-                 ".pptx",
-                 ".png",
-                 ".jpg",
-                 ".jpeg",
-             ],
-             job_name,
-             zip_filename,
-             user,
-             dir_name,
-             safe_user,
-         )
-     finally:
-         # Clean up temporary files
-         for temp_path in temp_files:
-             try:
-                 storage.delete_file(temp_path)
-             except Exception as e:
-                 current_app.logger.error(
-                     f"Error deleting temporary file {temp_path}: {e}",
-                     exc_info=True,
-                 )
-         # Clean up the zip file if it was created
-         if zip_temp_path and os.path.exists(zip_temp_path):
-             os.remove(zip_temp_path)
- else: # Keep this else block for single file upload
-     # For single file
-     file = files[0]
-     filename = safe_filename(file.filename)
-     file_path = f"{base_path}/{filename}"
-     storage.save_file(file, file_path)
-     task = ingest.delay(
-         settings.UPLOAD_FOLDER,
-         [
-             ".rst",
-             ".md",
-             ".pdf",
-             ".txt",
-             ".docx",
-             ".csv",
-             ".epub",
-             ".html",
-             ".mdx",
-             ".json",
-             ".xlsx",
-             ".pptx",
-             ".png",
-             ".jpg",
-             ".jpeg",
-         ],
-         job_name,
-         filename, # Corrected variable for single-file case
-         user,
-         dir_name,
-         safe_user,
-     )
current_app.logger.error(f"Error extracting zip: {e}", exc_info=True)
# If zip extraction fails, save the original zip file
file_path = f"{base_path}/{safe_file}"
with open(temp_file_path, 'rb') as f:
storage.save_file(f, file_path)
else:
# For non-zip files, save directly
file_path = f"{base_path}/{safe_file}"
with open(temp_file_path, 'rb') as f:
storage.save_file(f, file_path)
task = ingest.delay(
settings.UPLOAD_FOLDER,
[
".rst", ".md", ".pdf", ".txt", ".docx", ".csv", ".epub",
".html", ".mdx", ".json", ".xlsx", ".pptx", ".png",
".jpg", ".jpeg",
],
job_name,
user,
file_path=base_path,
filename=dir_name
)
except Exception as err:
current_app.logger.error(f"Error uploading file: {err}", exc_info=True)
return make_response(jsonify({"success": False}), 400)
return make_response(jsonify({"success": True, "task_id": task.id}), 200)
@user_ns.route("/api/manage_source_files")
class ManageSourceFiles(Resource):
@api.expect(
api.model(
"ManageSourceFilesModel",
{
"source_id": fields.String(required=True, description="Source ID to modify"),
"operation": fields.String(required=True, description="Operation: 'add', 'remove', or 'remove_directory'"),
"file_paths": fields.List(fields.String, required=False, description="File paths to remove (for remove operation)"),
"directory_path": fields.String(required=False, description="Directory path to remove (for remove_directory operation)"),
"file": fields.Raw(required=False, description="Files to add (for add operation)"),
"parent_dir": fields.String(required=False, description="Parent directory path relative to source root"),
},
)
)
@api.doc(
description="Add files, remove files, or remove directories from an existing source",
)
def post(self):
decoded_token = request.decoded_token
if not decoded_token:
return make_response(jsonify({"success": False, "message": "Unauthorized"}), 401)
user = decoded_token.get("sub")
source_id = request.form.get("source_id")
operation = request.form.get("operation")
if not source_id or not operation:
return make_response(
jsonify({"success": False, "message": "source_id and operation are required"}), 400
)
if operation not in ["add", "remove", "remove_directory"]:
return make_response(
jsonify({"success": False, "message": "operation must be 'add', 'remove', or 'remove_directory'"}), 400
)
try:
ObjectId(source_id)
except Exception:
return make_response(
jsonify({"success": False, "message": "Invalid source ID format"}), 400
)
try:
source = sources_collection.find_one({"_id": ObjectId(source_id), "user": user})
if not source:
return make_response(
jsonify({"success": False, "message": "Source not found or access denied"}), 404
)
except Exception as err:
current_app.logger.error(f"Error finding source: {err}", exc_info=True)
return make_response(jsonify({"success": False, "message": "Database error"}), 500)
try:
storage = StorageCreator.get_storage()
source_file_path = source.get("file_path", "")
parent_dir = request.form.get("parent_dir", "")
if parent_dir and (parent_dir.startswith("/") or ".." in parent_dir):
return make_response(
jsonify({"success": False, "message": "Invalid parent directory path"}), 400
)
if operation == "add":
files = request.files.getlist("file")
if not files or all(file.filename == "" for file in files):
return make_response(
jsonify({"success": False, "message": "No files provided for add operation"}), 400
)
added_files = []
target_dir = source_file_path
if parent_dir:
target_dir = f"{source_file_path}/{parent_dir}"
for file in files:
if file.filename:
safe_filename_str = safe_filename(file.filename)
file_path = f"{target_dir}/{safe_filename_str}"
# Save file to storage
storage.save_file(file, file_path)
added_files.append(safe_filename_str)
# Trigger re-ingestion pipeline
from application.api.user.tasks import reingest_source_task
task = reingest_source_task.delay(source_id=source_id, user=user)
return make_response(jsonify({
"success": True,
"message": f"Added {len(added_files)} files",
"added_files": added_files,
"parent_dir": parent_dir,
"reingest_task_id": task.id
}), 200)
elif operation == "remove":
file_paths_str = request.form.get("file_paths")
if not file_paths_str:
return make_response(
jsonify({"success": False, "message": "file_paths required for remove operation"}), 400
)
try:
file_paths = json.loads(file_paths_str) if isinstance(file_paths_str, str) else file_paths_str
except Exception:
return make_response(
jsonify({"success": False, "message": "Invalid file_paths format"}), 400
)
# Remove files from storage and directory structure
removed_files = []
for file_path in file_paths:
full_path = f"{source_file_path}/{file_path}"
# Remove from storage
if storage.file_exists(full_path):
storage.delete_file(full_path)
removed_files.append(file_path)
# Trigger re-ingestion pipeline
from application.api.user.tasks import reingest_source_task
task = reingest_source_task.delay(source_id=source_id, user=user)
return make_response(jsonify({
"success": True,
"message": f"Removed {len(removed_files)} files",
"removed_files": removed_files,
"reingest_task_id": task.id
}), 200)
elif operation == "remove_directory":
directory_path = request.form.get("directory_path")
if not directory_path:
return make_response(
jsonify({"success": False, "message": "directory_path required for remove_directory operation"}), 400
)
# Validate directory path (prevent path traversal)
if directory_path.startswith("/") or ".." in directory_path:
current_app.logger.warning(
f"Invalid directory path attempted for removal. "
f"User: {user}, Source ID: {source_id}, Directory path: {directory_path}"
)
return make_response(
jsonify({"success": False, "message": "Invalid directory path"}), 400
)
full_directory_path = f"{source_file_path}/{directory_path}" if directory_path else source_file_path
if not storage.is_directory(full_directory_path):
current_app.logger.warning(
f"Directory not found or is not a directory for removal. "
f"User: {user}, Source ID: {source_id}, Directory path: {directory_path}, "
f"Full path: {full_directory_path}"
)
return make_response(
jsonify({"success": False, "message": "Directory not found or is not a directory"}), 404
)
success = storage.remove_directory(full_directory_path)
if not success:
current_app.logger.error(
f"Failed to remove directory from storage. "
f"User: {user}, Source ID: {source_id}, Directory path: {directory_path}, "
f"Full path: {full_directory_path}"
)
return make_response(
jsonify({"success": False, "message": "Failed to remove directory"}), 500
)
current_app.logger.info(
f"Successfully removed directory. "
f"User: {user}, Source ID: {source_id}, Directory path: {directory_path}, "
f"Full path: {full_directory_path}"
)
# Trigger re-ingestion pipeline
from application.api.user.tasks import reingest_source_task
task = reingest_source_task.delay(source_id=source_id, user=user)
return make_response(jsonify({
"success": True,
"message": f"Successfully removed directory: {directory_path}",
"removed_directory": directory_path,
"reingest_task_id": task.id
}), 200)
except Exception as err:
error_context = f"operation={operation}, user={user}, source_id={source_id}"
if operation == "remove_directory":
directory_path = request.form.get("directory_path", "")
error_context += f", directory_path={directory_path}"
elif operation == "remove":
file_paths_str = request.form.get("file_paths", "")
error_context += f", file_paths={file_paths_str}"
elif operation == "add":
parent_dir = request.form.get("parent_dir", "")
error_context += f", parent_dir={parent_dir}"
current_app.logger.error(f"Error managing source files: {err} ({error_context})", exc_info=True)
return make_response(jsonify({"success": False, "message": "Operation failed"}), 500)
@user_ns.route("/api/remote")
class UploadRemote(Resource):
@api.expect(
@@ -834,6 +984,7 @@ class PaginatedSources(Resource):
"tokens": doc.get("tokens", ""),
"retriever": doc.get("retriever", "classic"),
"syncFrequency": doc.get("sync_frequency", ""),
"isNested": bool(doc.get("directory_structure"))
}
paginated_docs.append(doc_data)
response = {
@@ -881,6 +1032,7 @@ class CombinedJson(Resource):
"tokens": index.get("tokens", ""),
"retriever": index.get("retriever", "classic"),
"syncFrequency": index.get("sync_frequency", ""),
"is_nested": bool(index.get("directory_structure"))
}
)
except Exception as err:
@@ -3374,8 +3526,14 @@ class DeleteTool(Resource):
@user_ns.route("/api/get_chunks")
class GetChunks(Resource):
@api.doc(
description="Retrieves all chunks associated with a document",
params={"id": "The document ID"},
description="Retrieves chunks from a document, optionally filtered by file path and search term",
params={
"id": "The document ID",
"page": "Page number for pagination",
"per_page": "Number of chunks per page",
"path": "Optional: Filter chunks by relative file path",
"search": "Optional: Search term to filter chunks by title or content"
},
)
def get(self):
decoded_token = request.decoded_token
@@ -3385,6 +3543,8 @@ class GetChunks(Resource):
doc_id = request.args.get("id")
page = int(request.args.get("page", 1))
per_page = int(request.args.get("per_page", 10))
path = request.args.get("path")
search_term = request.args.get("search", "").strip().lower()
if not ObjectId.is_valid(doc_id):
return make_response(jsonify({"error": "Invalid doc_id"}), 400)
@@ -3396,6 +3556,30 @@ class GetChunks(Resource):
try:
store = get_vector_store(doc_id)
chunks = store.get_chunks()
filtered_chunks = []
for chunk in chunks:
metadata = chunk.get("metadata", {})
# Filter by path if provided
if path:
chunk_source = metadata.get("source", "")
# Check if the chunk's source matches the requested path
if not chunk_source or not chunk_source.endswith(path):
continue
# Filter by search term if provided
if search_term:
text_match = search_term in chunk.get("text", "").lower()
title_match = search_term in metadata.get("title", "").lower()
if not (text_match or title_match):
continue
filtered_chunks.append(chunk)
chunks = filtered_chunks
total_chunks = len(chunks)
start = (page - 1) * per_page
end = start + per_page
@@ -3408,6 +3592,8 @@ class GetChunks(Resource):
"per_page": per_page,
"total": total_chunks,
"chunks": paginated_chunks,
"path": path if path else None,
"search": search_term if search_term else None
}
),
200,
@@ -3416,7 +3602,6 @@ class GetChunks(Resource):
current_app.logger.error(f"Error getting chunks: {e}", exc_info=True)
return make_response(jsonify({"success": False}), 500)
@user_ns.route("/api/add_chunk")
class AddChunk(Resource):
@api.expect(
@@ -3448,6 +3633,8 @@ class AddChunk(Resource):
doc_id = data.get("id")
text = data.get("text")
metadata = data.get("metadata", {})
token_count = num_tokens_from_string(text)
metadata["token_count"] = token_count
if not ObjectId.is_valid(doc_id):
return make_response(jsonify({"error": "Invalid doc_id"}), 400)
@@ -3544,6 +3731,12 @@ class UpdateChunk(Resource):
text = data.get("text")
metadata = data.get("metadata")
if text is not None:
token_count = num_tokens_from_string(text)
if metadata is None:
metadata = {}
metadata["token_count"] = token_count
if not ObjectId.is_valid(doc_id):
return make_response(jsonify({"error": "Invalid doc_id"}), 400)
doc = sources_collection.find_one({"_id": ObjectId(doc_id), "user": user})
@@ -3553,31 +3746,45 @@ class UpdateChunk(Resource):
)
try:
store = get_vector_store(doc_id)
chunks = store.get_chunks()
existing_chunk = next((c for c in chunks if c["doc_id"] == chunk_id), None)
if not existing_chunk:
return make_response(jsonify({"error": "Chunk not found"}), 404)
- deleted = store.delete_chunk(chunk_id)
- if not deleted:
-     return make_response(
-         jsonify({"error": "Failed to delete existing chunk"}), 500
-     )
new_text = text if text is not None else existing_chunk["text"]
- new_metadata = (
-     metadata if metadata is not None else existing_chunk["metadata"]
- )
- new_chunk_id = store.add_chunk(new_text, new_metadata)
+ if metadata is not None:
+     new_metadata = existing_chunk["metadata"].copy()
+     new_metadata.update(metadata)
+ else:
+     new_metadata = existing_chunk["metadata"].copy()
- return make_response(
-     jsonify(
-         {
-             "message": "Chunk updated successfully",
-             "new_chunk_id": new_chunk_id,
-         }
-     ),
-     200,
- )
+ if text is not None:
+     new_metadata["token_count"] = num_tokens_from_string(new_text)
+ try:
+     new_chunk_id = store.add_chunk(new_text, new_metadata)
+     deleted = store.delete_chunk(chunk_id)
+     if not deleted:
+         current_app.logger.warning(f"Failed to delete old chunk {chunk_id}, but new chunk {new_chunk_id} was created")
+     return make_response(
+         jsonify(
+             {
+                 "message": "Chunk updated successfully",
+                 "chunk_id": new_chunk_id,
+                 "original_chunk_id": chunk_id,
+             }
+         ),
+         200,
+     )
+ except Exception as add_error:
+     current_app.logger.error(f"Failed to add updated chunk: {add_error}")
+     return make_response(
+         jsonify({"error": "Failed to update chunk - addition failed"}), 500
+     )
except Exception as e:
current_app.logger.error(f"Error updating chunk: {e}", exc_info=True)
return make_response(jsonify({"success": False}), 500)
@@ -3681,3 +3888,51 @@ class ServeImage(Resource):
return make_response(
jsonify({"success": False, "message": "Error retrieving image"}), 500
)
@user_ns.route("/api/directory_structure")
class DirectoryStructure(Resource):
@api.doc(
description="Get the directory structure for a document",
params={"id": "The document ID"},
)
def get(self):
decoded_token = request.decoded_token
if not decoded_token:
return make_response(jsonify({"success": False}), 401)
user = decoded_token.get("sub")
doc_id = request.args.get("id")
if not doc_id:
return make_response(
jsonify({"error": "Document ID is required"}), 400
)
if not ObjectId.is_valid(doc_id):
return make_response(jsonify({"error": "Invalid document ID"}), 400)
try:
doc = sources_collection.find_one({"_id": ObjectId(doc_id), "user": user})
if not doc:
return make_response(
jsonify({"error": "Document not found or access denied"}), 404
)
directory_structure = doc.get("directory_structure", {})
return make_response(
jsonify({
"success": True,
"directory_structure": directory_structure,
"base_path": doc.get("file_path", "")
}), 200
)
except Exception as e:
current_app.logger.error(
f"Error retrieving directory structure: {e}", exc_info=True
)
return make_response(
jsonify({"success": False, "error": str(e)}), 500
)
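
A hedged sketch of driving the new source-file management and directory-structure endpoints end to end. Host, token, ids, and file names are placeholders; only the form fields and routes shown above are used.

import json
import requests

API = "http://localhost:7091"               # placeholder host
HEADERS = {"Authorization": "Bearer <jwt>"}  # assumed auth scheme
SOURCE_ID = "<source-object-id>"

# Add a file under a sub-directory of the source; the handler also queues a
# reingest_source_task and returns its id.
with open("changelog.md", "rb") as f:
    resp = requests.post(
        f"{API}/api/manage_source_files",
        headers=HEADERS,
        data={"source_id": SOURCE_ID, "operation": "add", "parent_dir": "notes"},
        files={"file": f},
    )
print(resp.json().get("reingest_task_id"))

# Remove files by their paths relative to the source root (JSON-encoded list).
requests.post(
    f"{API}/api/manage_source_files",
    headers=HEADERS,
    data={
        "source_id": SOURCE_ID,
        "operation": "remove",
        "file_paths": json.dumps(["notes/old.md", "README.old"]),
    },
)

# Inspect the stored directory tree for the source.
tree = requests.get(
    f"{API}/api/directory_structure",
    headers=HEADERS,
    params={"id": SOURCE_ID},
).json()["directory_structure"]
print(tree)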

View File

@@ -11,8 +11,8 @@ from application.worker import (
@celery.task(bind=True)
- def ingest(self, directory, formats, job_name, filename, user, dir_name, user_dir):
-     resp = ingest_worker(self, directory, formats, job_name, filename, user, dir_name, user_dir)
+ def ingest(self, directory, formats, job_name, user, file_path, filename):
+     resp = ingest_worker(self, directory, formats, job_name, file_path, filename, user)
return resp
@@ -22,6 +22,13 @@ def ingest_remote(self, source_data, job_name, user, loader):
return resp
@celery.task(bind=True)
def reingest_source_task(self, source_id, user):
from application.worker import reingest_source_worker
resp = reingest_source_worker(self, source_id, user)
return resp
@celery.task(bind=True)
def schedule_syncs(self, frequency):
resp = sync_worker(self, frequency)
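
A sketch of how the reworked task signatures above might be dispatched, assuming a running Celery worker and broker; the argument values are placeholders.

from application.api.user.tasks import ingest, reingest_source_task

# New ingest signature: the storage path travels as file_path and the display
# name as filename, matching the call made in routes.py.
task = ingest.delay(
    "inputs",                      # directory
    [".md", ".pdf", ".txt"],       # formats
    "My Docs",                     # job_name (original, unsanitized)
    "local",                       # user
    file_path="inputs/local/my-docs",   # hypothetical storage path
    filename="my-docs",
)

# Incremental re-ingestion after files were added or removed from a source.
reingest = reingest_source_task.delay(source_id="<source-object-id>", user="local")
print(task.id, reingest.id)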

View File

@@ -32,16 +32,7 @@ class Chunker:
header, body = "", text # No header, treat entire text as body
return header, body
- def combine_documents(self, doc: Document, next_doc: Document) -> Document:
-     combined_text = doc.text + " " + next_doc.text
-     combined_token_count = len(self.encoding.encode(combined_text))
-     new_doc = Document(
-         text=combined_text,
-         doc_id=doc.doc_id,
-         embedding=doc.embedding,
-         extra_info={**(doc.extra_info or {}), "token_count": combined_token_count}
-     )
-     return new_doc
def split_document(self, doc: Document) -> List[Document]:
split_docs = []
@@ -82,26 +73,11 @@ class Chunker:
processed_docs.append(doc)
i += 1
elif token_count < self.min_tokens:
- if i + 1 < len(documents):
-     next_doc = documents[i + 1]
-     next_tokens = self.encoding.encode(next_doc.text)
-     if token_count + len(next_tokens) <= self.max_tokens:
-         # Combine small documents
-         combined_doc = self.combine_documents(doc, next_doc)
-         processed_docs.append(combined_doc)
-         i += 2
-     else:
-         # Keep the small document as is if adding next_doc would exceed max_tokens
-         doc.extra_info = doc.extra_info or {}
-         doc.extra_info["token_count"] = token_count
-         processed_docs.append(doc)
-         i += 1
- else:
-     # No next document to combine with; add the small document as is
-     doc.extra_info = doc.extra_info or {}
-     doc.extra_info["token_count"] = token_count
-     processed_docs.append(doc)
-     i += 1
+ doc.extra_info = doc.extra_info or {}
+ doc.extra_info["token_count"] = token_count
+ processed_docs.append(doc)
+ i += 1
else:
# Split large documents
processed_docs.extend(self.split_document(doc))
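
An illustrative restatement of the simplified per-document flow: small documents are no longer merged with their neighbour, they are kept as-is with their own token_count. This assumes a tiktoken encoding like the one Chunker uses; the threshold values are placeholders, not the application's settings.

import tiktoken

encoding = tiktoken.get_encoding("cl100k_base")
MIN_TOKENS, MAX_TOKENS = 150, 1250  # placeholder thresholds

def handle(doc_text: str) -> str:
    token_count = len(encoding.encode(doc_text))
    if token_count > MAX_TOKENS:
        return "split into smaller documents"
    # Below MAX_TOKENS (including very small documents) the text is kept
    # intact and annotated with its own token_count in extra_info.
    return f"kept as-is (token_count={token_count})"

print(handle("a short release note"))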

View File

@@ -46,7 +46,7 @@ def embed_and_store_documents(docs, folder_name, source_id, task_status):
store = VectorCreator.create_vectorstore(
settings.VECTOR_STORE,
docs_init=docs_init,
- source_id=folder_name,
+ source_id=source_id,
embeddings_key=os.getenv("EMBEDDINGS_KEY"),
)
else:

View File

@@ -15,6 +15,7 @@ from application.parser.file.json_parser import JSONParser
from application.parser.file.pptx_parser import PPTXParser
from application.parser.file.image_parser import ImageParser
from application.parser.schema.base import Document
from application.utils import num_tokens_from_string
DEFAULT_FILE_EXTRACTOR: Dict[str, BaseParser] = {
".pdf": PDFParser(),
@@ -141,11 +142,12 @@ class SimpleDirectoryReader(BaseReader):
Returns:
List[Document]: A list of documents.
"""
data: Union[str, List[str]] = ""
data_list: List[str] = []
metadata_list = []
self.file_token_counts = {}
for input_file in self.input_files:
if input_file.suffix in self.file_extractor:
parser = self.file_extractor[input_file.suffix]
@@ -156,24 +158,48 @@ class SimpleDirectoryReader(BaseReader):
# do standard read
with open(input_file, "r", errors=self.errors) as f:
data = f.read()
- # Prepare metadata for this file
- if self.file_metadata is not None:
-     file_metadata = self.file_metadata(input_file.name)
+ # Calculate token count for this file
+ if isinstance(data, List):
+     file_tokens = sum(num_tokens_from_string(str(d)) for d in data)
else:
-     # Provide a default empty metadata
-     file_metadata = {'title': '', 'store': ''}
-     # TODO: Find a case with no metadata and check if breaks anything
+     file_tokens = num_tokens_from_string(str(data))
full_path = str(input_file.resolve())
self.file_token_counts[full_path] = file_tokens
base_metadata = {
'title': input_file.name,
'token_count': file_tokens,
}
if hasattr(self, 'input_dir'):
try:
relative_path = str(input_file.relative_to(self.input_dir))
base_metadata['source'] = relative_path
except ValueError:
base_metadata['source'] = str(input_file)
else:
base_metadata['source'] = str(input_file)
if self.file_metadata is not None:
custom_metadata = self.file_metadata(input_file.name)
base_metadata.update(custom_metadata)
if isinstance(data, List):
# Extend data_list with each item in the data list
data_list.extend([str(d) for d in data])
# For each item in the data list, add the file's metadata to metadata_list
- metadata_list.extend([file_metadata for _ in data])
+ metadata_list.extend([base_metadata for _ in data])
else:
# Add the single piece of data to data_list
data_list.append(str(data))
# Add the file's metadata to metadata_list
- metadata_list.append(file_metadata)
+ metadata_list.append(base_metadata)
# Build directory structure if input_dir is provided
if hasattr(self, 'input_dir'):
self.directory_structure = self.build_directory_structure(self.input_dir)
logging.info("Directory structure built successfully")
else:
self.directory_structure = {}
if concatenate:
return [Document("\n".join(data_list))]
@@ -181,3 +207,48 @@ class SimpleDirectoryReader(BaseReader):
return [Document(d, extra_info=m) for d, m in zip(data_list, metadata_list)]
else:
return [Document(d) for d in data_list]
def build_directory_structure(self, base_path):
"""Build a dictionary representing the directory structure.
Args:
base_path: The base path to start building the structure from.
Returns:
dict: A nested dictionary representing the directory structure.
"""
import mimetypes
def build_tree(path):
"""Helper function to recursively build the directory tree."""
result = {}
for item in path.iterdir():
if self.exclude_hidden and item.name.startswith('.'):
continue
if item.is_dir():
subtree = build_tree(item)
if subtree:
result[item.name] = subtree
else:
if self.required_exts is not None and item.suffix not in self.required_exts:
continue
full_path = str(item.resolve())
file_size_bytes = item.stat().st_size
mime_type = mimetypes.guess_type(item.name)[0] or "application/octet-stream"
file_info = {
"type": mime_type,
"size_bytes": file_size_bytes
}
if hasattr(self, 'file_token_counts') and full_path in self.file_token_counts:
file_info["token_count"] = self.file_token_counts[full_path]
result[item.name] = file_info
return result
return build_tree(Path(base_path))
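
For illustration only, the kind of nested mapping build_directory_structure returns for a small source tree; names, sizes, and counts below are invented.

example_structure = {
    "guides": {
        "install.md": {
            "type": "text/markdown",
            "size_bytes": 2048,
            "token_count": 512,  # present only when the file was parsed in this run
        },
    },
    "README.md": {
        "type": "text/markdown",
        "size_bytes": 1024,
        "token_count": 250,
    },
}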

View File

@@ -93,3 +93,32 @@ class BaseStorage(ABC):
List[str]: List of file paths
"""
pass
@abstractmethod
def is_directory(self, path: str) -> bool:
"""
Check if a path is a directory.
Args:
path: Path to check
Returns:
bool: True if the path is a directory
"""
pass
@abstractmethod
def remove_directory(self, directory: str) -> bool:
"""
Remove a directory and all its contents.
For local storage, this removes the directory and all files/subdirectories within it.
For S3 storage, this removes all objects with the directory path as a prefix.
Args:
directory: Directory path to remove
Returns:
bool: True if removal was successful, False otherwise
"""
pass
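
A toy, dictionary-backed illustration of the contract these two methods are expected to honour. It does not subclass the real BaseStorage and exists only to show the expected return values.

class InMemoryStorage:
    def __init__(self):
        self.files = {}  # maps "a/b/c.txt" -> bytes

    def is_directory(self, path: str) -> bool:
        # A "directory" exists if any stored key lives under that prefix.
        prefix = path.rstrip("/") + "/"
        return any(key.startswith(prefix) for key in self.files)

    def remove_directory(self, directory: str) -> bool:
        prefix = directory.rstrip("/") + "/"
        matched = [key for key in self.files if key.startswith(prefix)]
        if not matched:
            return False
        for key in matched:
            del self.files[key]
        return True


storage = InMemoryStorage()
storage.files["inputs/user/source/docs/a.md"] = b"hello"
assert storage.is_directory("inputs/user/source/docs")
assert storage.remove_directory("inputs/user/source/docs") is True
assert storage.remove_directory("inputs/user/source/docs") is False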

View File

@@ -101,3 +101,40 @@ class LocalStorage(BaseStorage):
raise FileNotFoundError(f"File not found: {full_path}")
return processor_func(local_path=full_path, **kwargs)
def is_directory(self, path: str) -> bool:
"""
Check if a path is a directory in local storage.
Args:
path: Path to check
Returns:
bool: True if the path is a directory, False otherwise
"""
full_path = self._get_full_path(path)
return os.path.isdir(full_path)
def remove_directory(self, directory: str) -> bool:
"""
Remove a directory and all its contents from local storage.
Args:
directory: Directory path to remove
Returns:
bool: True if removal was successful, False otherwise
"""
full_path = self._get_full_path(directory)
if not os.path.exists(full_path):
return False
if not os.path.isdir(full_path):
return False
try:
shutil.rmtree(full_path)
return True
except (OSError, PermissionError):
return False

View File

@@ -130,3 +130,77 @@ class S3Storage(BaseStorage):
except Exception as e:
logging.error(f"Error processing S3 file {path}: {e}", exc_info=True)
raise
def is_directory(self, path: str) -> bool:
"""
Check if a path is a directory in S3 storage.
In S3, directories are virtual concepts. A path is considered a directory
if there are objects with the path as a prefix.
Args:
path: Path to check
Returns:
bool: True if the path is a directory, False otherwise
"""
# Ensure path ends with a slash if not empty
if path and not path.endswith('/'):
path += '/'
response = self.s3.list_objects_v2(
Bucket=self.bucket_name,
Prefix=path,
MaxKeys=1
)
return 'Contents' in response
def remove_directory(self, directory: str) -> bool:
"""
Remove a directory and all its contents from S3 storage.
In S3, this removes all objects with the directory path as a prefix.
Since S3 doesn't have actual directories, this effectively removes
all files within the virtual directory structure.
Args:
directory: Directory path to remove
Returns:
bool: True if removal was successful, False otherwise
"""
# Ensure directory ends with a slash if not empty
if directory and not directory.endswith('/'):
directory += '/'
try:
# Get all objects with the directory prefix
objects_to_delete = []
paginator = self.s3.get_paginator('list_objects_v2')
pages = paginator.paginate(Bucket=self.bucket_name, Prefix=directory)
for page in pages:
if 'Contents' in page:
for obj in page['Contents']:
objects_to_delete.append({'Key': obj['Key']})
if not objects_to_delete:
return False
batch_size = 1000
for i in range(0, len(objects_to_delete), batch_size):
batch = objects_to_delete[i:i + batch_size]
response = self.s3.delete_objects(
Bucket=self.bucket_name,
Delete={'Objects': batch}
)
if 'Errors' in response and response['Errors']:
return False
return True
except ClientError:
return False
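
The 1000-key batching above matches the S3 DeleteObjects limit of at most 1000 keys per request. A standalone sketch of the same pattern follows; the bucket and prefix are placeholders.

import boto3

def delete_prefix(bucket: str, prefix: str) -> bool:
    s3 = boto3.client("s3")
    keys = []
    # Collect every object key under the virtual directory prefix.
    for page in s3.get_paginator("list_objects_v2").paginate(Bucket=bucket, Prefix=prefix):
        keys.extend({"Key": obj["Key"]} for obj in page.get("Contents", []))
    if not keys:
        return False
    # DeleteObjects accepts at most 1000 keys per call, hence the batching.
    for i in range(0, len(keys), 1000):
        response = s3.delete_objects(Bucket=bucket, Delete={"Objects": keys[i:i + 1000]})
        if response.get("Errors"):
            return False
    return True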

View File

@@ -1,5 +1,6 @@
import os
import tempfile
import io
from langchain_community.vectorstores import FAISS
@@ -66,8 +67,37 @@ class FaissStore(BaseVectorStore):
def add_texts(self, *args, **kwargs):
return self.docsearch.add_texts(*args, **kwargs)
- def save_local(self, *args, **kwargs):
-     return self.docsearch.save_local(*args, **kwargs)
def _save_to_storage(self):
"""
Save the FAISS index to storage using temporary directory pattern.
Works consistently for both local and S3 storage.
"""
with tempfile.TemporaryDirectory() as temp_dir:
self.docsearch.save_local(temp_dir)
faiss_path = os.path.join(temp_dir, "index.faiss")
pkl_path = os.path.join(temp_dir, "index.pkl")
with open(faiss_path, "rb") as f_faiss:
faiss_data = f_faiss.read()
with open(pkl_path, "rb") as f_pkl:
pkl_data = f_pkl.read()
storage_path = get_vectorstore(self.source_id)
self.storage.save_file(io.BytesIO(faiss_data), f"{storage_path}/index.faiss")
self.storage.save_file(io.BytesIO(pkl_data), f"{storage_path}/index.pkl")
return True
def save_local(self, path=None):
if path:
os.makedirs(path, exist_ok=True)
self.docsearch.save_local(path)
self._save_to_storage()
return True
def delete_index(self, *args, **kwargs):
return self.docsearch.delete(*args, **kwargs)
@@ -103,13 +133,17 @@ class FaissStore(BaseVectorStore):
return chunks
def add_chunk(self, text, metadata=None):
"""Add a new chunk and save to storage."""
metadata = metadata or {}
doc = Document(text=text, extra_info=metadata).to_langchain_format()
doc_id = self.docsearch.add_documents([doc])
- self.save_local(self.path)
+ self._save_to_storage()
return doc_id
def delete_chunk(self, chunk_id):
"""Delete a chunk and save to storage."""
self.delete_index([chunk_id])
- self.save_local(self.path)
+ self._save_to_storage()
return True

View File

@@ -103,11 +103,23 @@ def download_file(url, params, dest_path):
def upload_index(full_path, file_data):
files = None
try:
if settings.VECTOR_STORE == "faiss":
faiss_path = full_path + "/index.faiss"
pkl_path = full_path + "/index.pkl"
if not os.path.exists(faiss_path):
logging.error(f"FAISS index file not found: {faiss_path}")
raise FileNotFoundError(f"FAISS index file not found: {faiss_path}")
if not os.path.exists(pkl_path):
logging.error(f"FAISS pickle file not found: {pkl_path}")
raise FileNotFoundError(f"FAISS pickle file not found: {pkl_path}")
files = {
"file_faiss": open(full_path + "/index.faiss", "rb"),
"file_pkl": open(full_path + "/index.pkl", "rb"),
"file_faiss": open(faiss_path, "rb"),
"file_pkl": open(pkl_path, "rb"),
}
response = requests.post(
urljoin(settings.API_URL, "/api/upload_index"),
@@ -119,11 +131,11 @@ def upload_index(full_path, file_data):
urljoin(settings.API_URL, "/api/upload_index"), data=file_data
)
response.raise_for_status()
- except requests.RequestException as e:
+ except (requests.RequestException, FileNotFoundError) as e:
logging.error(f"Error uploading index: {e}")
raise
finally:
if settings.VECTOR_STORE == "faiss":
if settings.VECTOR_STORE == "faiss" and files is not None:
for file in files.values():
file.close()
@@ -200,15 +212,8 @@ def run_agent_logic(agent_config, input_data):
def ingest_worker(
- self,
- directory,
- formats,
- job_name,
- filename,
- user,
- dir_name=None,
- user_dir=None,
- retriever="classic",
+ self, directory, formats, job_name, file_path, filename, user,
+ retriever="classic"
):
"""
Ingest and process documents.
@@ -218,10 +223,9 @@ def ingest_worker(
directory (str): Specifies the directory for ingesting ('inputs' or 'temp').
formats (list of str): List of file extensions to consider for ingestion (e.g., [".rst", ".md"]).
job_name (str): Name of the job for this ingestion task (original, unsanitized).
- filename (str): Name of the file to be ingested.
+ file_path (str): Complete file path to use consistently throughout the pipeline.
+ filename (str): Original unsanitized filename provided by the user.
user (str): Identifier for the user initiating the ingestion (original, unsanitized).
- dir_name (str, optional): Sanitized directory name for filesystem operations.
- user_dir (str, optional): Sanitized user ID for filesystem operations.
retriever (str): Type of retriever to use for processing the documents.
Returns:
@@ -234,11 +238,8 @@ def ingest_worker(
sample = False
storage = StorageCreator.get_storage()
- full_path = os.path.join(directory, user_dir, dir_name)
- source_file_path = os.path.join(full_path, filename)
- logging.info(f"Ingest file: {full_path}", extra={"user": user, "job": job_name})
+ logging.info(f"Ingest path: {file_path}", extra={"user": user, "job": job_name})
# Create temporary working directory
@@ -246,22 +247,46 @@ def ingest_worker(
try:
os.makedirs(temp_dir, exist_ok=True)
# Download file from storage to temp directory
if storage.is_directory(file_path):
# Handle directory case
logging.info(f"Processing directory: {file_path}")
files_list = storage.list_files(file_path)
for storage_file_path in files_list:
if storage.is_directory(storage_file_path):
continue
# Create relative path structure in temp directory
rel_path = os.path.relpath(storage_file_path, file_path)
local_file_path = os.path.join(temp_dir, rel_path)
os.makedirs(os.path.dirname(local_file_path), exist_ok=True)
# Download file
try:
file_data = storage.get_file(storage_file_path)
with open(local_file_path, "wb") as f:
f.write(file_data.read())
except Exception as e:
logging.error(f"Error downloading file {storage_file_path}: {e}")
continue
else:
# Handle single file case
temp_filename = os.path.basename(file_path)
temp_file_path = os.path.join(temp_dir, temp_filename)
file_data = storage.get_file(file_path)
with open(temp_file_path, "wb") as f:
f.write(file_data.read())
-     temp_file_path = os.path.join(temp_dir, filename)
-     file_data = storage.get_file(source_file_path)
+     # Handle zip files
+     if temp_filename.endswith(".zip"):
+         logging.info(f"Extracting zip file: {temp_filename}")
+         extract_zip_recursive(
+             temp_file_path, temp_dir, current_depth=0, max_depth=RECURSION_DEPTH
+         )
-     with open(temp_file_path, "wb") as f:
-         f.write(file_data.read())
self.update_state(state="PROGRESS", meta={"current": 1})
-     # Handle zip files
-     if filename.endswith(".zip"):
-         logging.info(f"Extracting zip file: {filename}")
-         extract_zip_recursive(
-             temp_file_path, temp_dir, current_depth=0, max_depth=RECURSION_DEPTH
-         )
if sample:
logging.info(f"Sample mode enabled. Using {limit} documents.")
reader = SimpleDirectoryReader(
@@ -273,6 +298,9 @@ def ingest_worker(
file_metadata=metadata_from_filename,
)
raw_docs = reader.load_data()
directory_structure = getattr(reader, 'directory_structure', {})
logging.info(f"Directory structure from reader: {directory_structure}")
chunker = Chunker(
chunking_strategy="classic_chunk",
@@ -299,14 +327,15 @@ def ingest_worker(
for i in range(min(5, len(raw_docs))):
logging.info(f"Sample document {i}: {raw_docs[i]}")
file_data = {
"name": job_name, # Use original job_name
"name": job_name,
"file": filename,
"user": user, # Use original user
"user": user,
"tokens": tokens,
"retriever": retriever,
"id": str(id),
"type": "local",
"original_file_path": source_file_path,
"file_path": file_path,
"directory_structure": json.dumps(directory_structure),
}
upload_index(vector_store_path, file_data)
@@ -323,6 +352,252 @@ def ingest_worker(
}
def reingest_source_worker(self, source_id, user):
"""
Re-ingestion worker that handles incremental updates by:
1. Adding chunks from newly added files
2. Removing chunks from deleted files
Args:
self: Task instance
source_id: ID of the source to re-ingest
user: User identifier
Returns:
dict: Information about the re-ingestion task
"""
try:
from application.vectorstore.vector_creator import VectorCreator
self.update_state(state="PROGRESS", meta={"current": 10, "status": "Initializing re-ingestion scan"})
source = sources_collection.find_one({"_id": ObjectId(source_id), "user": user})
if not source:
raise ValueError(f"Source {source_id} not found or access denied")
storage = StorageCreator.get_storage()
source_file_path = source.get("file_path", "")
self.update_state(state="PROGRESS", meta={"current": 20, "status": "Scanning current files"})
with tempfile.TemporaryDirectory() as temp_dir:
# Download all files from storage to temp directory, preserving directory structure
if storage.is_directory(source_file_path):
files_list = storage.list_files(source_file_path)
for storage_file_path in files_list:
if storage.is_directory(storage_file_path):
continue
rel_path = os.path.relpath(storage_file_path, source_file_path)
local_file_path = os.path.join(temp_dir, rel_path)
os.makedirs(os.path.dirname(local_file_path), exist_ok=True)
# Download file
try:
file_data = storage.get_file(storage_file_path)
with open(local_file_path, "wb") as f:
f.write(file_data.read())
except Exception as e:
logging.error(f"Error downloading file {storage_file_path}: {e}")
continue
reader = SimpleDirectoryReader(
input_dir=temp_dir,
recursive=True,
required_exts=[
".rst", ".md", ".pdf", ".txt", ".docx", ".csv", ".epub",
".html", ".mdx", ".json", ".xlsx", ".pptx", ".png",
".jpg", ".jpeg",
],
exclude_hidden=True,
file_metadata=metadata_from_filename,
)
reader.load_data()
directory_structure = reader.directory_structure
logging.info(f"Directory structure built with token counts: {directory_structure}")
try:
old_directory_structure = source.get("directory_structure") or {}
if isinstance(old_directory_structure, str):
try:
old_directory_structure = json.loads(old_directory_structure)
except Exception:
old_directory_structure = {}
def _flatten_directory_structure(struct, prefix=""):
files = set()
if isinstance(struct, dict):
for name, meta in struct.items():
current_path = os.path.join(prefix, name) if prefix else name
if isinstance(meta, dict) and ("type" in meta and "size_bytes" in meta):
files.add(current_path)
elif isinstance(meta, dict):
files |= _flatten_directory_structure(meta, current_path)
return files
old_files = _flatten_directory_structure(old_directory_structure)
new_files = _flatten_directory_structure(directory_structure)
added_files = sorted(new_files - old_files)
removed_files = sorted(old_files - new_files)
if added_files:
logging.info(f"Files added since last ingest: {added_files}")
else:
logging.info("No files added since last ingest.")
if removed_files:
logging.info(f"Files removed since last ingest: {removed_files}")
else:
logging.info("No files removed since last ingest.")
except Exception as e:
logging.error(f"Error comparing directory structures: {e}", exc_info=True)
added_files = []
removed_files = []
try:
if not added_files and not removed_files:
logging.info("No changes detected.")
return {
"source_id": source_id,
"user": user,
"status": "no_changes",
"added_files": [],
"removed_files": [],
}
vector_store = VectorCreator.create_vectorstore(
settings.VECTOR_STORE,
source_id,
settings.EMBEDDINGS_KEY,
)
self.update_state(state="PROGRESS", meta={"current": 40, "status": "Processing file changes"})
# 1) Delete chunks from removed files
deleted = 0
if removed_files:
try:
for ch in vector_store.get_chunks() or []:
metadata = ch.get("metadata", {}) if isinstance(ch, dict) else getattr(ch, "metadata", {})
raw_source = metadata.get("source")
source_file = str(raw_source) if raw_source else ""
if source_file in removed_files:
cid = ch.get("doc_id")
if cid:
try:
vector_store.delete_chunk(cid)
deleted += 1
except Exception as de:
logging.error(f"Failed deleting chunk {cid}: {de}")
logging.info(f"Deleted {deleted} chunks from {len(removed_files)} removed files")
except Exception as e:
logging.error(f"Error during deletion of removed file chunks: {e}", exc_info=True)
# 2) Add chunks from new files
added = 0
if added_files:
try:
# Build list of local files for added files only
added_local_files = []
for rel_path in added_files:
local_path = os.path.join(temp_dir, rel_path)
if os.path.isfile(local_path):
added_local_files.append(local_path)
if added_local_files:
reader_new = SimpleDirectoryReader(
input_files=added_local_files,
exclude_hidden=True,
errors="ignore",
file_metadata=metadata_from_filename,
)
raw_docs_new = reader_new.load_data()
chunker_new = Chunker(
chunking_strategy="classic_chunk",
max_tokens=MAX_TOKENS,
min_tokens=MIN_TOKENS,
duplicate_headers=False,
)
chunked_new = chunker_new.chunk(documents=raw_docs_new)
for file_path, token_count in reader_new.file_token_counts.items():
try:
rel_path = os.path.relpath(file_path, start=temp_dir)
path_parts = rel_path.split(os.sep)
current_dir = directory_structure
for part in path_parts[:-1]:
if part in current_dir and isinstance(current_dir[part], dict):
current_dir = current_dir[part]
else:
break
filename = path_parts[-1]
if filename in current_dir and isinstance(current_dir[filename], dict):
current_dir[filename]["token_count"] = token_count
logging.info(f"Updated token count for {rel_path}: {token_count}")
except Exception as e:
logging.warning(f"Could not update token count for {file_path}: {e}")
for d in chunked_new:
meta = dict(d.extra_info or {})
try:
raw_src = meta.get("source")
if isinstance(raw_src, str) and os.path.isabs(raw_src):
meta["source"] = os.path.relpath(raw_src, start=temp_dir)
except Exception:
pass
vector_store.add_chunk(d.text, metadata=meta)
added += 1
logging.info(f"Added {added} chunks from {len(added_files)} new files")
except Exception as e:
logging.error(f"Error during ingestion of new files: {e}", exc_info=True)
# 3) Update source directory structure timestamp
try:
total_tokens = sum(reader.file_token_counts.values())
sources_collection.update_one(
{"_id": ObjectId(source_id)},
{
"$set": {
"directory_structure": directory_structure,
"date": datetime.datetime.now(),
"tokens": total_tokens
}
},
)
except Exception as e:
logging.error(f"Error updating directory_structure in DB: {e}", exc_info=True)
self.update_state(state="PROGRESS", meta={"current": 100, "status": "Re-ingestion completed"})
return {
"source_id": source_id,
"user": user,
"status": "completed",
"added_files": added_files,
"removed_files": removed_files,
"chunks_added": added,
"chunks_deleted": deleted,
}
except Exception as e:
logging.error(f"Error while processing file changes: {e}", exc_info=True)
raise
except Exception as e:
logging.error(f"Error in reingest_source_worker: {e}", exc_info=True)
raise
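
A self-contained sketch of the directory-structure diffing that drives the incremental update above; the example trees are invented.

import os

def flatten(struct, prefix=""):
    """Collect relative file paths from a nested directory-structure dict."""
    files = set()
    if isinstance(struct, dict):
        for name, meta in struct.items():
            current = os.path.join(prefix, name) if prefix else name
            if isinstance(meta, dict) and "type" in meta and "size_bytes" in meta:
                files.add(current)                 # leaf entry describing a file
            elif isinstance(meta, dict):
                files |= flatten(meta, current)    # sub-directory, recurse
    return files

old = {"docs": {"a.md": {"type": "text/markdown", "size_bytes": 10}}}
new = {"docs": {"a.md": {"type": "text/markdown", "size_bytes": 10},
                "b.md": {"type": "text/markdown", "size_bytes": 20}}}

added = sorted(flatten(new) - flatten(old))    # ['docs/b.md'] on POSIX -> chunks get added
removed = sorted(flatten(old) - flatten(new))  # [] -> no chunks to delete
print(added, removed)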
def remote_worker(
self,
source_data,