From fd905b1a06cfaa839b6705686bc9171cb5adc6a0 Mon Sep 17 00:00:00 2001
From: ManishMadan2882
Date: Wed, 2 Jul 2025 16:30:29 +0530
Subject: [PATCH] (feat:dir-reader) save tokens with filenames

---
 application/api/internal/routes.py | 13 +++++++++++++
 application/parser/file/bulk.py    | 15 +++++++++++++++
 application/worker.py              |  5 +++++
 3 files changed, 33 insertions(+)

diff --git a/application/api/internal/routes.py b/application/api/internal/routes.py
index 20ce31c7..3f839f40 100755
--- a/application/api/internal/routes.py
+++ b/application/api/internal/routes.py
@@ -1,5 +1,6 @@
 import os
 import datetime
+import json
 from flask import Blueprint, request, send_from_directory
 from werkzeug.utils import secure_filename
 from bson.objectid import ObjectId
@@ -49,6 +50,16 @@ def upload_index_files():
 
     sync_frequency = request.form["sync_frequency"] if "sync_frequency" in request.form else None
     file_path = request.form.get("file_path")
+    file_token_counts = request.form.get("file_token_counts")
+
+    if file_token_counts:
+        try:
+            file_token_counts = json.loads(file_token_counts)
+        except json.JSONDecodeError:
+            logger.error("Error parsing file_token_counts")
+            file_token_counts = {}
+    else:
+        file_token_counts = {}
 
     storage = StorageCreator.get_storage()
     index_base_path = f"indexes/{id}"
@@ -88,6 +99,7 @@ def upload_index_files():
                 "remote_data": remote_data,
                 "sync_frequency": sync_frequency,
                 "file_path": file_path,
+                "file_token_counts": file_token_counts,
             }
         },
     )
@@ -106,6 +118,7 @@ def upload_index_files():
             "remote_data": remote_data,
             "sync_frequency": sync_frequency,
             "file_path": file_path,
+            "file_token_counts": file_token_counts,
         }
     )
     return {"status": "ok"}
diff --git a/application/parser/file/bulk.py b/application/parser/file/bulk.py
index da6dc298..2851dcdd 100644
--- a/application/parser/file/bulk.py
+++ b/application/parser/file/bulk.py
@@ -15,6 +15,7 @@ from application.parser.file.json_parser import JSONParser
 from application.parser.file.pptx_parser import PPTXParser
 from application.parser.file.image_parser import ImageParser
 from application.parser.schema.base import Document
+from application.utils import num_tokens_from_string
 
 DEFAULT_FILE_EXTRACTOR: Dict[str, BaseParser] = {
     ".pdf": PDFParser(),
@@ -146,6 +147,8 @@ class SimpleDirectoryReader(BaseReader):
         data: Union[str, List[str]] = ""
         data_list: List[str] = []
         metadata_list = []
+        file_token_counts = {}
+
         for input_file in self.input_files:
             if input_file.suffix in self.file_extractor:
                 parser = self.file_extractor[input_file.suffix]
@@ -156,6 +159,15 @@ class SimpleDirectoryReader(BaseReader):
                 # do standard read
                 with open(input_file, "r", errors=self.errors) as f:
                     data = f.read()
+
+            # Calculate token count for this file
+            if isinstance(data, list):
+                file_tokens = sum(num_tokens_from_string(str(d)) for d in data)
+            else:
+                file_tokens = num_tokens_from_string(str(data))
+
+            file_token_counts[input_file.name] = file_tokens
+
             # Prepare metadata for this file
             if self.file_metadata is not None:
                 file_metadata = self.file_metadata(input_file.name)
@@ -175,6 +187,9 @@ class SimpleDirectoryReader(BaseReader):
                 # Add the file's metadata to metadata_list
                 metadata_list.append(file_metadata)
 
+        self.file_token_counts = file_token_counts
+        logging.info("File token counts: %s", file_token_counts)
+
         if concatenate:
             return [Document("\n".join(data_list))]
         elif self.file_metadata is not None:
diff --git a/application/worker.py b/application/worker.py
index e685b371..805fa0ed 100755
--- a/application/worker.py
+++ b/application/worker.py
@@ -258,6 +258,10 @@ def ingest_worker(
         file_metadata=metadata_from_filename,
     )
     raw_docs = reader.load_data()
+
+    file_token_counts = getattr(reader, 'file_token_counts', {})
+
+    logging.info("File token counts from reader: %s", file_token_counts)
 
     chunker = Chunker(
         chunking_strategy="classic_chunk",
@@ -292,6 +296,7 @@ def ingest_worker(
         "id": str(id),
         "type": "local",
         "file_path": file_path,
+        "file_token_counts": json.dumps(file_token_counts),
     }
 
     upload_index(vector_store_path, file_data)