feat(dir-reader): maintain directory structure in DB

This commit is contained in:
ManishMadan2882
2025-07-03 01:24:22 +05:30
parent fd905b1a06
commit 2ef23fe1b3
3 changed files with 56 additions and 16 deletions

View File

@@ -50,16 +50,16 @@ def upload_index_files():
sync_frequency = request.form["sync_frequency"] if "sync_frequency" in request.form else None sync_frequency = request.form["sync_frequency"] if "sync_frequency" in request.form else None
file_path = request.form.get("file_path") file_path = request.form.get("file_path")
file_token_counts = request.form.get("file_token_counts") directory_structure = request.form.get("directory_structure")
if file_token_counts: if directory_structure:
try: try:
file_token_counts = json.loads(file_token_counts) directory_structure = json.loads(directory_structure)
except: except:
logger.error("Error parsing file_token_counts") logger.error("Error parsing directory_structure")
file_token_counts = {} directory_structure = {}
else: else:
file_token_counts = {} directory_structure = {}
storage = StorageCreator.get_storage() storage = StorageCreator.get_storage()
index_base_path = f"indexes/{id}" index_base_path = f"indexes/{id}"
@@ -99,7 +99,7 @@ def upload_index_files():
"remote_data": remote_data, "remote_data": remote_data,
"sync_frequency": sync_frequency, "sync_frequency": sync_frequency,
"file_path": file_path, "file_path": file_path,
"file_token_counts": file_token_counts, "directory_structure": directory_structure,
} }
}, },
) )
@@ -118,7 +118,7 @@ def upload_index_files():
"remote_data": remote_data, "remote_data": remote_data,
"sync_frequency": sync_frequency, "sync_frequency": sync_frequency,
"file_path": file_path, "file_path": file_path,
"file_token_counts": file_token_counts, "directory_structure": directory_structure,
} }
) )
return {"status": "ok"} return {"status": "ok"}

View File

@@ -142,7 +142,6 @@ class SimpleDirectoryReader(BaseReader):
Returns: Returns:
List[Document]: A list of documents. List[Document]: A list of documents.
""" """
data: Union[str, List[str]] = "" data: Union[str, List[str]] = ""
data_list: List[str] = [] data_list: List[str] = []
@@ -188,7 +187,13 @@ class SimpleDirectoryReader(BaseReader):
metadata_list.append(file_metadata) metadata_list.append(file_metadata)
self.file_token_counts = file_token_counts self.file_token_counts = file_token_counts
logging.info(f"File token counts: {file_token_counts}")
# Build directory structure if input_dir is provided
if hasattr(self, 'input_dir'):
self.directory_structure = self._build_directory_structure(self.input_dir)
logging.info(f"Directory structure built successfully")
else:
self.directory_structure = {}
if concatenate: if concatenate:
return [Document("\n".join(data_list))] return [Document("\n".join(data_list))]
@@ -196,3 +201,39 @@ class SimpleDirectoryReader(BaseReader):
return [Document(d, extra_info=m) for d, m in zip(data_list, metadata_list)] return [Document(d, extra_info=m) for d, m in zip(data_list, metadata_list)]
else: else:
return [Document(d) for d in data_list] return [Document(d) for d in data_list]
def _build_directory_structure(self, base_path):
"""Build a dictionary representing the directory structure.
Args:
base_path: The base path to start building the structure from.
Returns:
dict: A nested dictionary representing the directory structure.
"""
structure = {}
base_path = Path(base_path)
def _build_tree(path, current_dict):
for item in path.iterdir():
if item.is_dir():
if self.exclude_hidden and item.name.startswith('.'):
continue
current_dict[item.name] = {}
_build_tree(item, current_dict[item.name])
else:
if self.exclude_hidden and item.name.startswith('.'):
continue
if self.required_exts is not None and item.suffix not in self.required_exts:
continue
# Store file with its token count if available
if hasattr(self, 'file_token_counts') and item.name in self.file_token_counts:
current_dict[item.name] = {
"type": "file",
"token_count": self.file_token_counts[item.name]
}
else:
current_dict[item.name] = {"type": "file"}
_build_tree(base_path, structure)
return structure

View File

@@ -259,9 +259,8 @@ def ingest_worker(
) )
raw_docs = reader.load_data() raw_docs = reader.load_data()
file_token_counts = getattr(reader, 'file_token_counts', {}) directory_structure = getattr(reader, 'directory_structure', {})
logging.info(f"Directory structure from reader: {directory_structure}")
logging.info(f"File token counts from reader: {file_token_counts}")
chunker = Chunker( chunker = Chunker(
chunking_strategy="classic_chunk", chunking_strategy="classic_chunk",
@@ -288,15 +287,15 @@ def ingest_worker(
for i in range(min(5, len(raw_docs))): for i in range(min(5, len(raw_docs))):
logging.info(f"Sample document {i}: {raw_docs[i]}") logging.info(f"Sample document {i}: {raw_docs[i]}")
file_data = { file_data = {
"name": job_name, # Use original job_name "name": job_name,
"file": filename, "file": filename,
"user": user, # Use original user "user": user,
"tokens": tokens, "tokens": tokens,
"retriever": retriever, "retriever": retriever,
"id": str(id), "id": str(id),
"type": "local", "type": "local",
"file_path": file_path, "file_path": file_path,
"file_token_counts": json.dumps(file_token_counts), "directory_structure": json.dumps(directory_structure),
} }
upload_index(vector_store_path, file_data) upload_index(vector_store_path, file_data)