mirror of
https://github.com/arc53/DocsGPT.git
synced 2025-11-29 08:33:20 +00:00
(feat:dir-reader) maintain dir structure in db
This commit is contained in:
@@ -50,16 +50,16 @@ def upload_index_files():
|
|||||||
sync_frequency = request.form["sync_frequency"] if "sync_frequency" in request.form else None
|
sync_frequency = request.form["sync_frequency"] if "sync_frequency" in request.form else None
|
||||||
|
|
||||||
file_path = request.form.get("file_path")
|
file_path = request.form.get("file_path")
|
||||||
file_token_counts = request.form.get("file_token_counts")
|
directory_structure = request.form.get("directory_structure")
|
||||||
|
|
||||||
if file_token_counts:
|
if directory_structure:
|
||||||
try:
|
try:
|
||||||
file_token_counts = json.loads(file_token_counts)
|
directory_structure = json.loads(directory_structure)
|
||||||
except:
|
except:
|
||||||
logger.error("Error parsing file_token_counts")
|
logger.error("Error parsing directory_structure")
|
||||||
file_token_counts = {}
|
directory_structure = {}
|
||||||
else:
|
else:
|
||||||
file_token_counts = {}
|
directory_structure = {}
|
||||||
|
|
||||||
storage = StorageCreator.get_storage()
|
storage = StorageCreator.get_storage()
|
||||||
index_base_path = f"indexes/{id}"
|
index_base_path = f"indexes/{id}"
|
||||||
@@ -99,7 +99,7 @@ def upload_index_files():
|
|||||||
"remote_data": remote_data,
|
"remote_data": remote_data,
|
||||||
"sync_frequency": sync_frequency,
|
"sync_frequency": sync_frequency,
|
||||||
"file_path": file_path,
|
"file_path": file_path,
|
||||||
"file_token_counts": file_token_counts,
|
"directory_structure": directory_structure,
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
@@ -118,7 +118,7 @@ def upload_index_files():
|
|||||||
"remote_data": remote_data,
|
"remote_data": remote_data,
|
||||||
"sync_frequency": sync_frequency,
|
"sync_frequency": sync_frequency,
|
||||||
"file_path": file_path,
|
"file_path": file_path,
|
||||||
"file_token_counts": file_token_counts,
|
"directory_structure": directory_structure,
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
return {"status": "ok"}
|
return {"status": "ok"}
|
||||||
|
|||||||
@@ -142,7 +142,6 @@ class SimpleDirectoryReader(BaseReader):
|
|||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
List[Document]: A list of documents.
|
List[Document]: A list of documents.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
data: Union[str, List[str]] = ""
|
data: Union[str, List[str]] = ""
|
||||||
data_list: List[str] = []
|
data_list: List[str] = []
|
||||||
@@ -188,7 +187,13 @@ class SimpleDirectoryReader(BaseReader):
|
|||||||
metadata_list.append(file_metadata)
|
metadata_list.append(file_metadata)
|
||||||
|
|
||||||
self.file_token_counts = file_token_counts
|
self.file_token_counts = file_token_counts
|
||||||
logging.info(f"File token counts: {file_token_counts}")
|
|
||||||
|
# Build directory structure if input_dir is provided
|
||||||
|
if hasattr(self, 'input_dir'):
|
||||||
|
self.directory_structure = self._build_directory_structure(self.input_dir)
|
||||||
|
logging.info(f"Directory structure built successfully")
|
||||||
|
else:
|
||||||
|
self.directory_structure = {}
|
||||||
|
|
||||||
if concatenate:
|
if concatenate:
|
||||||
return [Document("\n".join(data_list))]
|
return [Document("\n".join(data_list))]
|
||||||
@@ -196,3 +201,39 @@ class SimpleDirectoryReader(BaseReader):
|
|||||||
return [Document(d, extra_info=m) for d, m in zip(data_list, metadata_list)]
|
return [Document(d, extra_info=m) for d, m in zip(data_list, metadata_list)]
|
||||||
else:
|
else:
|
||||||
return [Document(d) for d in data_list]
|
return [Document(d) for d in data_list]
|
||||||
|
|
||||||
|
def _build_directory_structure(self, base_path):
    """Build a nested dictionary mirroring the directory tree under base_path.

    Directories become nested dictionaries keyed by directory name. Files
    become ``{"type": "file"}`` entries, with an extra ``"token_count"``
    key when ``self.file_token_counts`` has an entry for that file name.

    Entries are filtered with the reader's own settings: names starting
    with ``.`` are skipped when ``self.exclude_hidden`` is truthy, and
    files whose suffix is not in ``self.required_exts`` are skipped when
    that attribute is not ``None``. Directories are kept even if every
    file inside them ends up filtered out.

    NOTE: token counts are looked up by bare file name, so files that
    share a name in different directories resolve to the same count.

    Args:
        base_path: The base path to start building the structure from.

    Returns:
        dict: A nested dictionary representing the directory structure,
        with entries inserted in name-sorted order at every level.
    """
    root = Path(base_path)
    # The attribute may be missing (load_data not run yet) or None; treat
    # both as "no counts available" instead of failing on the `in` test.
    token_counts = getattr(self, "file_token_counts", None) or {}
    skip_hidden = self.exclude_hidden
    allowed_exts = self.required_exts

    def _build_tree(path, current_dict):
        # iterdir() yields entries in arbitrary order; sorting keeps the
        # persisted structure stable across runs and platforms.
        for item in sorted(path.iterdir(), key=lambda entry: entry.name):
            name = item.name
            if skip_hidden and name.startswith("."):
                continue
            if item.is_dir():
                current_dict[name] = {}
                _build_tree(item, current_dict[name])
                continue
            if allowed_exts is not None and item.suffix not in allowed_exts:
                continue
            file_entry = {"type": "file"}
            if name in token_counts:
                file_entry["token_count"] = token_counts[name]
            current_dict[name] = file_entry

    structure = {}
    _build_tree(root, structure)
    return structure
|
||||||
|
|||||||
@@ -259,9 +259,8 @@ def ingest_worker(
|
|||||||
)
|
)
|
||||||
raw_docs = reader.load_data()
|
raw_docs = reader.load_data()
|
||||||
|
|
||||||
file_token_counts = getattr(reader, 'file_token_counts', {})
|
directory_structure = getattr(reader, 'directory_structure', {})
|
||||||
|
logging.info(f"Directory structure from reader: {directory_structure}")
|
||||||
logging.info(f"File token counts from reader: {file_token_counts}")
|
|
||||||
|
|
||||||
chunker = Chunker(
|
chunker = Chunker(
|
||||||
chunking_strategy="classic_chunk",
|
chunking_strategy="classic_chunk",
|
||||||
@@ -288,15 +287,15 @@ def ingest_worker(
|
|||||||
for i in range(min(5, len(raw_docs))):
|
for i in range(min(5, len(raw_docs))):
|
||||||
logging.info(f"Sample document {i}: {raw_docs[i]}")
|
logging.info(f"Sample document {i}: {raw_docs[i]}")
|
||||||
file_data = {
|
file_data = {
|
||||||
"name": job_name, # Use original job_name
|
"name": job_name,
|
||||||
"file": filename,
|
"file": filename,
|
||||||
"user": user, # Use original user
|
"user": user,
|
||||||
"tokens": tokens,
|
"tokens": tokens,
|
||||||
"retriever": retriever,
|
"retriever": retriever,
|
||||||
"id": str(id),
|
"id": str(id),
|
||||||
"type": "local",
|
"type": "local",
|
||||||
"file_path": file_path,
|
"file_path": file_path,
|
||||||
"file_token_counts": json.dumps(file_token_counts),
|
"directory_structure": json.dumps(directory_structure),
|
||||||
}
|
}
|
||||||
|
|
||||||
upload_index(vector_store_path, file_data)
|
upload_index(vector_store_path, file_data)
|
||||||
|
|||||||
Reference in New Issue
Block a user