From fd905b1a06cfaa839b6705686bc9171cb5adc6a0 Mon Sep 17 00:00:00 2001
From: ManishMadan2882
Date: Wed, 2 Jul 2025 16:30:29 +0530
Subject: [PATCH] (feat:dir-reader) save tokens with filenames

---
 application/api/internal/routes.py | 13 +++++++++++++
 application/parser/file/bulk.py    | 15 +++++++++++++++
 application/worker.py              |  5 +++++
 3 files changed, 33 insertions(+)

diff --git a/application/api/internal/routes.py b/application/api/internal/routes.py
index 20ce31c7..3f839f40 100755
--- a/application/api/internal/routes.py
+++ b/application/api/internal/routes.py
@@ -1,5 +1,6 @@
 import os
 import datetime
+import json
 from flask import Blueprint, request, send_from_directory
 from werkzeug.utils import secure_filename
 from bson.objectid import ObjectId
@@ -49,6 +50,16 @@ def upload_index_files():
 
     sync_frequency = request.form["sync_frequency"] if "sync_frequency" in request.form else None
     file_path = request.form.get("file_path")
+    file_token_counts = request.form.get("file_token_counts")
+
+    if file_token_counts:
+        try:
+            file_token_counts = json.loads(file_token_counts)
+        except json.JSONDecodeError:
+            logger.error("Error parsing file_token_counts")
+            file_token_counts = {}
+    else:
+        file_token_counts = {}
 
     storage = StorageCreator.get_storage()
     index_base_path = f"indexes/{id}"
@@ -88,6 +99,7 @@ def upload_index_files():
                 "remote_data": remote_data,
                 "sync_frequency": sync_frequency,
                 "file_path": file_path,
+                "file_token_counts": file_token_counts,
             }
         },
     )
@@ -106,6 +118,7 @@ def upload_index_files():
             "remote_data": remote_data,
             "sync_frequency": sync_frequency,
             "file_path": file_path,
+            "file_token_counts": file_token_counts,
         }
     )
     return {"status": "ok"}
diff --git a/application/parser/file/bulk.py b/application/parser/file/bulk.py
index da6dc298..2851dcdd 100644
--- a/application/parser/file/bulk.py
+++ b/application/parser/file/bulk.py
@@ -15,6 +15,7 @@ from application.parser.file.json_parser import JSONParser
 from application.parser.file.pptx_parser import PPTXParser
 from application.parser.file.image_parser import ImageParser
 from application.parser.schema.base import Document
+from application.utils import num_tokens_from_string
 
 DEFAULT_FILE_EXTRACTOR: Dict[str, BaseParser] = {
     ".pdf": PDFParser(),
@@ -146,6 +147,8 @@ class SimpleDirectoryReader(BaseReader):
         data: Union[str, List[str]] = ""
         data_list: List[str] = []
         metadata_list = []
+        file_token_counts = {}
+
         for input_file in self.input_files:
             if input_file.suffix in self.file_extractor:
                 parser = self.file_extractor[input_file.suffix]
@@ -156,6 +159,15 @@ class SimpleDirectoryReader(BaseReader):
                 # do standard read
                 with open(input_file, "r", errors=self.errors) as f:
                     data = f.read()
+
+            # Calculate token count for this file
+            if isinstance(data, list):
+                file_tokens = sum(num_tokens_from_string(str(d)) for d in data)
+            else:
+                file_tokens = num_tokens_from_string(str(data))
+
+            file_token_counts[input_file.name] = file_tokens
+
             # Prepare metadata for this file
             if self.file_metadata is not None:
                 file_metadata = self.file_metadata(input_file.name)
@@ -175,6 +187,9 @@ class SimpleDirectoryReader(BaseReader):
                 # Add the file's metadata to metadata_list
                 metadata_list.append(file_metadata)
 
+        self.file_token_counts = file_token_counts
+        logging.info("File token counts: %s", file_token_counts)
+
         if concatenate:
             return [Document("\n".join(data_list))]
         elif self.file_metadata is not None:
diff --git a/application/worker.py b/application/worker.py
index e685b371..805fa0ed 100755
--- a/application/worker.py
+++ b/application/worker.py
@@ -258,6 +258,10 @@ def ingest_worker(
         file_metadata=metadata_from_filename,
     )
     raw_docs = reader.load_data()
+
+    file_token_counts = getattr(reader, 'file_token_counts', {})
+
+    logging.info("File token counts from reader: %s", file_token_counts)
 
     chunker = Chunker(
         chunking_strategy="classic_chunk",
@@ -292,6 +296,7 @@ def ingest_worker(
         "id": str(id),
         "type": "local",
         "file_path": file_path,
+        "file_token_counts": json.dumps(file_token_counts),
     }
 
     upload_index(vector_store_path, file_data)