From 2ef23fe1b37822c16feaf96bf2ca92a381408a3f Mon Sep 17 00:00:00 2001 From: ManishMadan2882 Date: Thu, 3 Jul 2025 01:24:22 +0530 Subject: [PATCH] (feat:dir-reader) maintain dir structure in db --- application/api/internal/routes.py | 16 +++++------ application/parser/file/bulk.py | 45 ++++++++++++++++++++++++++++-- application/worker.py | 11 ++++---- 3 files changed, 56 insertions(+), 16 deletions(-) diff --git a/application/api/internal/routes.py b/application/api/internal/routes.py index 3f839f40..da87655b 100755 --- a/application/api/internal/routes.py +++ b/application/api/internal/routes.py @@ -50,16 +50,16 @@ def upload_index_files(): sync_frequency = request.form["sync_frequency"] if "sync_frequency" in request.form else None file_path = request.form.get("file_path") - file_token_counts = request.form.get("file_token_counts") + directory_structure = request.form.get("directory_structure") - if file_token_counts: + if directory_structure: try: - file_token_counts = json.loads(file_token_counts) + directory_structure = json.loads(directory_structure) except: - logger.error("Error parsing file_token_counts") - file_token_counts = {} + logger.error("Error parsing directory_structure") + directory_structure = {} else: - file_token_counts = {} + directory_structure = {} storage = StorageCreator.get_storage() index_base_path = f"indexes/{id}" @@ -99,7 +99,7 @@ def upload_index_files(): "remote_data": remote_data, "sync_frequency": sync_frequency, "file_path": file_path, - "file_token_counts": file_token_counts, + "directory_structure": directory_structure, } }, ) @@ -118,7 +118,7 @@ def upload_index_files(): "remote_data": remote_data, "sync_frequency": sync_frequency, "file_path": file_path, - "file_token_counts": file_token_counts, + "directory_structure": directory_structure, } ) return {"status": "ok"} diff --git a/application/parser/file/bulk.py b/application/parser/file/bulk.py index 2851dcdd..85ed9404 100644 --- a/application/parser/file/bulk.py +++ 
b/application/parser/file/bulk.py @@ -142,7 +142,6 @@ class SimpleDirectoryReader(BaseReader): Returns: List[Document]: A list of documents. - """ data: Union[str, List[str]] = "" data_list: List[str] = [] @@ -188,7 +187,13 @@ class SimpleDirectoryReader(BaseReader): metadata_list.append(file_metadata) self.file_token_counts = file_token_counts - logging.info(f"File token counts: {file_token_counts}") + + # Build directory structure if input_dir is provided + if hasattr(self, 'input_dir'): + self.directory_structure = self._build_directory_structure(self.input_dir) + logging.info("Directory structure built successfully") + else: + self.directory_structure = {} if concatenate: return [Document("\n".join(data_list))] @@ -196,3 +201,39 @@ class SimpleDirectoryReader(BaseReader): return [Document(d, extra_info=m) for d, m in zip(data_list, metadata_list)] else: return [Document(d) for d in data_list] + + def _build_directory_structure(self, base_path): + """Build a dictionary representing the directory structure. + + Args: + base_path: The base path to start building the structure from. + + Returns: + dict: A nested dictionary representing the directory structure.
+ """ + structure = {} + base_path = Path(base_path) + + def _build_tree(path, current_dict): + for item in path.iterdir(): + if item.is_dir(): + if self.exclude_hidden and item.name.startswith('.'): + continue + current_dict[item.name] = {} + _build_tree(item, current_dict[item.name]) + else: + if self.exclude_hidden and item.name.startswith('.'): + continue + if self.required_exts is not None and item.suffix not in self.required_exts: + continue + # Store file with its token count if available + if hasattr(self, 'file_token_counts') and item.name in self.file_token_counts: + current_dict[item.name] = { + "type": "file", + "token_count": self.file_token_counts[item.name] + } + else: + current_dict[item.name] = {"type": "file"} + + _build_tree(base_path, structure) + return structure diff --git a/application/worker.py b/application/worker.py index 805fa0ed..8cc7ac20 100755 --- a/application/worker.py +++ b/application/worker.py @@ -259,9 +259,8 @@ def ingest_worker( ) raw_docs = reader.load_data() - file_token_counts = getattr(reader, 'file_token_counts', {}) - - logging.info(f"File token counts from reader: {file_token_counts}") + directory_structure = getattr(reader, 'directory_structure', {}) + logging.info(f"Directory structure from reader: {directory_structure}") chunker = Chunker( chunking_strategy="classic_chunk", @@ -288,15 +287,15 @@ def ingest_worker( for i in range(min(5, len(raw_docs))): logging.info(f"Sample document {i}: {raw_docs[i]}") file_data = { - "name": job_name, # Use original job_name + "name": job_name, "file": filename, - "user": user, # Use original user + "user": user, "tokens": tokens, "retriever": retriever, "id": str(id), "type": "local", "file_path": file_path, - "file_token_counts": json.dumps(file_token_counts), + "directory_structure": json.dumps(directory_structure), } upload_index(vector_store_path, file_data)