feat(dir-reader): save token counts with filenames

This commit is contained in:
ManishMadan2882
2025-07-02 16:30:29 +05:30
parent ade704d065
commit fd905b1a06
3 changed files with 33 additions and 0 deletions

View File

@@ -1,5 +1,6 @@
import os
import datetime
import json
from flask import Blueprint, request, send_from_directory
from werkzeug.utils import secure_filename
from bson.objectid import ObjectId
@@ -49,6 +50,16 @@ def upload_index_files():
sync_frequency = request.form["sync_frequency"] if "sync_frequency" in request.form else None
file_path = request.form.get("file_path")
file_token_counts = request.form.get("file_token_counts")
# Parse the optional per-file token counts submitted with the form.
# `file_token_counts` arrives as a JSON-encoded mapping of
# filename -> token count; normalize to an empty dict when the field is
# absent or malformed so the downstream DB update always stores a dict.
if file_token_counts:
    try:
        file_token_counts = json.loads(file_token_counts)
    except json.JSONDecodeError:
        # Catch only JSON parse failures — the original bare `except:`
        # would also swallow SystemExit/KeyboardInterrupt and hide real
        # bugs. Include the traceback for debuggability.
        logger.error("Error parsing file_token_counts", exc_info=True)
        file_token_counts = {}
else:
    file_token_counts = {}
storage = StorageCreator.get_storage()
index_base_path = f"indexes/{id}"
@@ -88,6 +99,7 @@ def upload_index_files():
"remote_data": remote_data,
"sync_frequency": sync_frequency,
"file_path": file_path,
"file_token_counts": file_token_counts,
}
},
)
@@ -106,6 +118,7 @@ def upload_index_files():
"remote_data": remote_data,
"sync_frequency": sync_frequency,
"file_path": file_path,
"file_token_counts": file_token_counts,
}
)
return {"status": "ok"}

View File

@@ -15,6 +15,7 @@ from application.parser.file.json_parser import JSONParser
from application.parser.file.pptx_parser import PPTXParser
from application.parser.file.image_parser import ImageParser
from application.parser.schema.base import Document
from application.utils import num_tokens_from_string
DEFAULT_FILE_EXTRACTOR: Dict[str, BaseParser] = {
".pdf": PDFParser(),
@@ -146,6 +147,8 @@ class SimpleDirectoryReader(BaseReader):
data: Union[str, List[str]] = ""
data_list: List[str] = []
metadata_list = []
file_token_counts = {}
for input_file in self.input_files:
if input_file.suffix in self.file_extractor:
parser = self.file_extractor[input_file.suffix]
@@ -156,6 +159,15 @@ class SimpleDirectoryReader(BaseReader):
# do standard read
with open(input_file, "r", errors=self.errors) as f:
data = f.read()
# Calculate the token count for this file so callers (e.g. the ingest
# worker) can report per-file usage after load_data() returns.
# Parsers may return either a list of chunks or a single string;
# sum over the chunks in the list case.
if isinstance(data, list):
    # NOTE: the builtin `list` is used here — isinstance checks against
    # typing.List are deprecated and only accidentally worked.
    file_tokens = sum(num_tokens_from_string(str(d)) for d in data)
else:
    file_tokens = num_tokens_from_string(str(data))
file_token_counts[input_file.name] = file_tokens
# Prepare metadata for this file
if self.file_metadata is not None:
file_metadata = self.file_metadata(input_file.name)
@@ -175,6 +187,9 @@ class SimpleDirectoryReader(BaseReader):
# Add the file's metadata to metadata_list
metadata_list.append(file_metadata)
# Expose the per-file token counts on the reader instance so callers
# can retrieve them via getattr(reader, "file_token_counts", {}) after
# load_data() completes.
self.file_token_counts = file_token_counts
logging.info(f"File token counts: {file_token_counts}")
if concatenate:
return [Document("\n".join(data_list))]
elif self.file_metadata is not None:

View File

@@ -258,6 +258,10 @@ def ingest_worker(
file_metadata=metadata_from_filename,
)
raw_docs = reader.load_data()
# Per-file token counts recorded by SimpleDirectoryReader.load_data();
# default to {} so readers that do not track token usage still work.
file_token_counts = getattr(reader, 'file_token_counts', {})
logging.info(f"File token counts from reader: {file_token_counts}")
chunker = Chunker(
chunking_strategy="classic_chunk",
@@ -292,6 +296,7 @@ def ingest_worker(
"id": str(id),
"type": "local",
"file_path": file_path,
"file_token_counts": json.dumps(file_token_counts),
}
upload_index(vector_store_path, file_data)