mirror of
https://github.com/arc53/DocsGPT.git
synced 2025-11-29 16:43:16 +00:00
(feat:dir-reader) save tokens with filenames
This commit is contained in:
@@ -1,5 +1,6 @@
|
||||
import os
|
||||
import datetime
|
||||
import json
|
||||
from flask import Blueprint, request, send_from_directory
|
||||
from werkzeug.utils import secure_filename
|
||||
from bson.objectid import ObjectId
|
||||
@@ -49,6 +50,16 @@ def upload_index_files():
|
||||
sync_frequency = request.form["sync_frequency"] if "sync_frequency" in request.form else None
|
||||
|
||||
file_path = request.form.get("file_path")
|
||||
file_token_counts = request.form.get("file_token_counts")
|
||||
|
||||
if file_token_counts:
|
||||
try:
|
||||
file_token_counts = json.loads(file_token_counts)
|
||||
except:
|
||||
logger.error("Error parsing file_token_counts")
|
||||
file_token_counts = {}
|
||||
else:
|
||||
file_token_counts = {}
|
||||
|
||||
storage = StorageCreator.get_storage()
|
||||
index_base_path = f"indexes/{id}"
|
||||
@@ -88,6 +99,7 @@ def upload_index_files():
|
||||
"remote_data": remote_data,
|
||||
"sync_frequency": sync_frequency,
|
||||
"file_path": file_path,
|
||||
"file_token_counts": file_token_counts,
|
||||
}
|
||||
},
|
||||
)
|
||||
@@ -106,6 +118,7 @@ def upload_index_files():
|
||||
"remote_data": remote_data,
|
||||
"sync_frequency": sync_frequency,
|
||||
"file_path": file_path,
|
||||
"file_token_counts": file_token_counts,
|
||||
}
|
||||
)
|
||||
return {"status": "ok"}
|
||||
|
||||
@@ -15,6 +15,7 @@ from application.parser.file.json_parser import JSONParser
|
||||
from application.parser.file.pptx_parser import PPTXParser
|
||||
from application.parser.file.image_parser import ImageParser
|
||||
from application.parser.schema.base import Document
|
||||
from application.utils import num_tokens_from_string
|
||||
|
||||
DEFAULT_FILE_EXTRACTOR: Dict[str, BaseParser] = {
|
||||
".pdf": PDFParser(),
|
||||
@@ -146,6 +147,8 @@ class SimpleDirectoryReader(BaseReader):
|
||||
data: Union[str, List[str]] = ""
|
||||
data_list: List[str] = []
|
||||
metadata_list = []
|
||||
file_token_counts = {}
|
||||
|
||||
for input_file in self.input_files:
|
||||
if input_file.suffix in self.file_extractor:
|
||||
parser = self.file_extractor[input_file.suffix]
|
||||
@@ -156,6 +159,15 @@ class SimpleDirectoryReader(BaseReader):
|
||||
# do standard read
|
||||
with open(input_file, "r", errors=self.errors) as f:
|
||||
data = f.read()
|
||||
|
||||
# Calculate token count for this file
|
||||
if isinstance(data, List):
|
||||
file_tokens = sum(num_tokens_from_string(str(d)) for d in data)
|
||||
else:
|
||||
file_tokens = num_tokens_from_string(str(data))
|
||||
|
||||
file_token_counts[input_file.name] = file_tokens
|
||||
|
||||
# Prepare metadata for this file
|
||||
if self.file_metadata is not None:
|
||||
file_metadata = self.file_metadata(input_file.name)
|
||||
@@ -175,6 +187,9 @@ class SimpleDirectoryReader(BaseReader):
|
||||
# Add the file's metadata to metadata_list
|
||||
metadata_list.append(file_metadata)
|
||||
|
||||
self.file_token_counts = file_token_counts
|
||||
logging.info(f"File token counts: {file_token_counts}")
|
||||
|
||||
if concatenate:
|
||||
return [Document("\n".join(data_list))]
|
||||
elif self.file_metadata is not None:
|
||||
|
||||
@@ -258,6 +258,10 @@ def ingest_worker(
|
||||
file_metadata=metadata_from_filename,
|
||||
)
|
||||
raw_docs = reader.load_data()
|
||||
|
||||
file_token_counts = getattr(reader, 'file_token_counts', {})
|
||||
|
||||
logging.info(f"File token counts from reader: {file_token_counts}")
|
||||
|
||||
chunker = Chunker(
|
||||
chunking_strategy="classic_chunk",
|
||||
@@ -292,6 +296,7 @@ def ingest_worker(
|
||||
"id": str(id),
|
||||
"type": "local",
|
||||
"file_path": file_path,
|
||||
"file_token_counts": json.dumps(file_token_counts),
|
||||
}
|
||||
|
||||
upload_index(vector_store_path, file_data)
|
||||
|
||||
Reference in New Issue
Block a user