(feat:dir-reader) maintain dir structure in db

2026-03-06 22:03:39 +00:00 · 2025-07-03 01:24:22 +05:30
parent fd905b1a06
commit 2ef23fe1b3
3 changed files with 56 additions and 16 deletions
--- a/application/api/internal/routes.py
+++ b/application/api/internal/routes.py
@@ -50,16 +50,16 @@ def upload_index_files():
    sync_frequency = request.form["sync_frequency"] if "sync_frequency" in request.form else None
    
    file_path = request.form.get("file_path")
-    file_token_counts = request.form.get("file_token_counts")
+    directory_structure = request.form.get("directory_structure")
    
-    if file_token_counts:
+    if directory_structure:
        try:
-            file_token_counts = json.loads(file_token_counts)
+            directory_structure = json.loads(directory_structure)
        except:
-            logger.error("Error parsing file_token_counts")
-            file_token_counts = {}
+            logger.error("Error parsing directory_structure")
+            directory_structure = {}
    else:
-        file_token_counts = {}
+        directory_structure = {}

    storage = StorageCreator.get_storage()
    index_base_path = f"indexes/{id}"
@@ -99,7 +99,7 @@ def upload_index_files():
                    "remote_data": remote_data,
                    "sync_frequency": sync_frequency,
                    "file_path": file_path,
-                    "file_token_counts": file_token_counts,
+                    "directory_structure": directory_structure,
                }
            },
        )
@@ -118,7 +118,7 @@ def upload_index_files():
                "remote_data": remote_data,
                "sync_frequency": sync_frequency,
                "file_path": file_path,
-                "file_token_counts": file_token_counts,
+                "directory_structure": directory_structure,
            }
        )
    return {"status": "ok"}
--- a/application/parser/file/bulk.py
+++ b/application/parser/file/bulk.py
@@ -142,7 +142,6 @@ class SimpleDirectoryReader(BaseReader):

        Returns:
            List[Document]: A list of documents.
-
        """
        data: Union[str, List[str]] = ""
        data_list: List[str] = []
@@ -188,7 +187,13 @@ class SimpleDirectoryReader(BaseReader):
                metadata_list.append(file_metadata)

        self.file_token_counts = file_token_counts
-        logging.info(f"File token counts: {file_token_counts}")
+        
+        # Build directory structure if input_dir is provided
+        if hasattr(self, 'input_dir'):
+            self.directory_structure = self._build_directory_structure(self.input_dir)
+            logging.info(f"Directory structure built successfully")
+        else:
+            self.directory_structure = {}

        if concatenate:
            return [Document("\n".join(data_list))]
@@ -196,3 +201,39 @@ class SimpleDirectoryReader(BaseReader):
            return [Document(d, extra_info=m) for d, m in zip(data_list, metadata_list)]
        else:
            return [Document(d) for d in data_list]
+
+    def _build_directory_structure(self, base_path):
+        """Build a nested dict mirroring the tree under ``base_path``.
+
+        Directories map to nested dicts; files map to ``{"type": "file"}``
+        plus ``"token_count"`` when ``self.file_token_counts`` has an entry
+        for the file name. Hidden entries (``self.exclude_hidden``) and files
+        whose suffix is not in ``self.required_exts`` are left out.
+        NOTE(review): counts are looked up by bare name, not by path - confirm.
+        """
+        structure = {}
+        base_path = Path(base_path)
+        # Recursive helper: fills current_dict in place with the entries of path.
+        def _build_tree(path, current_dict):
+            for item in path.iterdir():
+                if item.is_dir():
+                    if self.exclude_hidden and item.name.startswith('.'):
+                        continue
+                    current_dict[item.name] = {}
+                    _build_tree(item, current_dict[item.name])
+                else:
+                    if self.exclude_hidden and item.name.startswith('.'):
+                        continue
+                    if self.required_exts is not None and item.suffix not in self.required_exts:
+                        continue
+                    # Attach the token count when one is recorded under this file name
+                    if hasattr(self, 'file_token_counts') and item.name in self.file_token_counts:
+                        current_dict[item.name] = {
+                            "type": "file",
+                            "token_count": self.file_token_counts[item.name]
+                        }
+                    else:
+                        current_dict[item.name] = {"type": "file"}
+        # Walk from the root; the extension filter applies to files only, not directories.
+        _build_tree(base_path, structure)
+        return structure
--- a/application/worker.py
+++ b/application/worker.py
@@ -259,9 +259,8 @@ def ingest_worker(
            )
            raw_docs = reader.load_data()
            
-            file_token_counts = getattr(reader, 'file_token_counts', {})
-            
-            logging.info(f"File token counts from reader: {file_token_counts}")
+            directory_structure = getattr(reader, 'directory_structure', {})
+            logging.info(f"Directory structure from reader: {directory_structure}")

            chunker = Chunker(
                chunking_strategy="classic_chunk",
@@ -288,15 +287,15 @@ def ingest_worker(
                for i in range(min(5, len(raw_docs))):
                    logging.info(f"Sample document {i}: {raw_docs[i]}")
            file_data = {
-                "name": job_name,  # Use original job_name
+                "name": job_name,
                "file": filename,
-                "user": user,  # Use original user
+                "user": user,
                "tokens": tokens,
                "retriever": retriever,
                "id": str(id),
                "type": "local",
                "file_path": file_path,
-                "file_token_counts": json.dumps(file_token_counts),
+                "directory_structure": json.dumps(directory_structure),
            }

            upload_index(vector_store_path, file_data)