mirror of
https://github.com/arc53/DocsGPT.git
synced 2025-11-29 08:33:20 +00:00
Fixing ingestion metadata grouping
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -172,3 +172,4 @@ application/vectors/
|
|||||||
node_modules/
|
node_modules/
|
||||||
.vscode/settings.json
|
.vscode/settings.json
|
||||||
models/
|
models/
|
||||||
|
model/
|
||||||
|
|||||||
@@ -147,12 +147,24 @@ class SimpleDirectoryReader(BaseReader):
|
|||||||
# do standard read
|
# do standard read
|
||||||
with open(input_file, "r", errors=self.errors) as f:
|
with open(input_file, "r", errors=self.errors) as f:
|
||||||
data = f.read()
|
data = f.read()
|
||||||
if isinstance(data, List):
|
# Prepare metadata for this file
|
||||||
data_list.extend(data)
|
|
||||||
else:
|
|
||||||
data_list.append(str(data))
|
|
||||||
if self.file_metadata is not None:
|
if self.file_metadata is not None:
|
||||||
metadata_list.append(self.file_metadata(str(input_file)))
|
file_metadata = self.file_metadata(str(input_file))
|
||||||
|
else:
|
||||||
|
# Provide a default empty metadata
|
||||||
|
file_metadata = {'title': '', 'store': ''}
|
||||||
|
# TODO: Find a case with no metadata and check whether it breaks anything
|
||||||
|
|
||||||
|
if isinstance(data, List):
|
||||||
|
# Extend data_list with each item in the data list
|
||||||
|
data_list.extend([str(d) for d in data])
|
||||||
|
# For each item in the data list, add the file's metadata to metadata_list
|
||||||
|
metadata_list.extend([file_metadata for _ in data])
|
||||||
|
else:
|
||||||
|
# Add the single piece of data to data_list
|
||||||
|
data_list.append(str(data))
|
||||||
|
# Add the file's metadata to metadata_list
|
||||||
|
metadata_list.append(file_metadata)
|
||||||
|
|
||||||
if concatenate:
|
if concatenate:
|
||||||
return [Document("\n".join(data_list))]
|
return [Document("\n".join(data_list))]
|
||||||
|
|||||||
@@ -21,16 +21,15 @@ def group_documents(documents: List[Document], min_tokens: int, max_tokens: int)
|
|||||||
for doc in documents:
|
for doc in documents:
|
||||||
doc_len = len(tiktoken.get_encoding("cl100k_base").encode(doc.text))
|
doc_len = len(tiktoken.get_encoding("cl100k_base").encode(doc.text))
|
||||||
|
|
||||||
if current_group is None:
|
# Check if current group is empty or if the document can be added based on token count and matching metadata
|
||||||
current_group = Document(text=doc.text, doc_id=doc.doc_id, embedding=doc.embedding,
|
if current_group is None or (len(tiktoken.get_encoding("cl100k_base").encode(current_group.text)) + doc_len < max_tokens and doc_len < min_tokens and current_group.extra_info == doc.extra_info):
|
||||||
extra_info=doc.extra_info)
|
if current_group is None:
|
||||||
elif len(tiktoken.get_encoding("cl100k_base").encode(
|
current_group = doc # Use the document directly to retain its metadata
|
||||||
current_group.text)) + doc_len < max_tokens and doc_len < min_tokens:
|
else:
|
||||||
current_group.text += " " + doc.text
|
current_group.text += " " + doc.text # Append text to the current group
|
||||||
else:
|
else:
|
||||||
docs.append(current_group)
|
docs.append(current_group)
|
||||||
current_group = Document(text=doc.text, doc_id=doc.doc_id, embedding=doc.embedding,
|
current_group = doc # Start a new group with the current document
|
||||||
extra_info=doc.extra_info)
|
|
||||||
|
|
||||||
if current_group is not None:
|
if current_group is not None:
|
||||||
docs.append(current_group)
|
docs.append(current_group)
|
||||||
|
|||||||
@@ -201,8 +201,7 @@ export default function Conversation() {
|
|||||||
)}
|
)}
|
||||||
</div>
|
</div>
|
||||||
<p className="text-gray-595959 dark:text-bright-gray bg-white dark:bg-raisin-black w-[100vw] self-center bg-transparent p-5 text-center text-xs md:w-full">
|
<p className="text-gray-595959 dark:text-bright-gray bg-white dark:bg-raisin-black w-[100vw] self-center bg-transparent p-5 text-center text-xs md:w-full">
|
||||||
This is a chatbot that uses the GPT-3, Faiss and LangChain to answer
|
DocsGPT uses GenAI, please review critical information using sources.
|
||||||
questions.
|
|
||||||
</p>
|
</p>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|||||||
Reference in New Issue
Block a user