From ade704d065bae686b3e18c826a1bed144625ac0c Mon Sep 17 00:00:00 2001 From: ManishMadan2882 Date: Tue, 1 Jul 2025 04:00:57 +0530 Subject: [PATCH 01/57] (refactor:ingestion) pass file path once --- application/api/internal/routes.py | 6 +++--- application/api/user/routes.py | 26 ++++++++++++-------------- application/api/user/tasks.py | 4 ++-- application/worker.py | 27 +++++++++++++-------------- 4 files changed, 30 insertions(+), 33 deletions(-) diff --git a/application/api/internal/routes.py b/application/api/internal/routes.py index b4422e26..20ce31c7 100755 --- a/application/api/internal/routes.py +++ b/application/api/internal/routes.py @@ -48,7 +48,7 @@ def upload_index_files(): remote_data = request.form["remote_data"] if "remote_data" in request.form else None sync_frequency = request.form["sync_frequency"] if "sync_frequency" in request.form else None - original_file_path = request.form.get("original_file_path") + file_path = request.form.get("file_path") storage = StorageCreator.get_storage() index_base_path = f"indexes/{id}" @@ -87,7 +87,7 @@ def upload_index_files(): "retriever": retriever, "remote_data": remote_data, "sync_frequency": sync_frequency, - "file_path": original_file_path, + "file_path": file_path, } }, ) @@ -105,7 +105,7 @@ def upload_index_files(): "retriever": retriever, "remote_data": remote_data, "sync_frequency": sync_frequency, - "file_path": original_file_path, + "file_path": file_path, } ) return {"status": "ok"} diff --git a/application/api/user/routes.py b/application/api/user/routes.py index c2f89761..fc052421 100644 --- a/application/api/user/routes.py +++ b/application/api/user/routes.py @@ -553,11 +553,12 @@ class UploadFile(Resource): if len(files) > 1: temp_files = [] for file in files: - filename = safe_filename(file.filename) - temp_path = f"{base_path}/temp/{filename}" + original_filename = file.filename + safe_file = safe_filename(original_filename) + temp_path = f"{base_path}/temp/{safe_file}" storage.save_file(file, temp_path) - temp_files.append(temp_path) - print(f"Saved file: {filename}") + temp_files.append({"path": temp_path, "original_name": original_filename}) + print(f"Saved file: {original_filename}") zip_filename = f"{dir_name}.zip" zip_path = f"{base_path}/{zip_filename}" zip_temp_path = None @@ -625,14 +626,12 @@ class UploadFile(Resource): ".jpeg", ], job_name, - zip_filename, user, - dir_name, - safe_user, + file_path=zip_path, + filename=zip_filename ) finally: # Clean up temporary files - for temp_path in temp_files: try: storage.delete_file(temp_path) @@ -642,15 +641,15 @@ class UploadFile(Resource): exc_info=True, ) # Clean up the zip file if it was created - if zip_temp_path and os.path.exists(zip_temp_path): os.remove(zip_temp_path) else: # Keep this else block for single file upload # For single file file = files[0] - filename = safe_filename(file.filename) - file_path = f"{base_path}/{filename}" + original_filename = file.filename + safe_file = safe_filename(original_filename) + file_path = f"{base_path}/{safe_file}" storage.save_file(file, file_path) @@ -674,10 +673,9 @@ class UploadFile(Resource): ".jpeg", ], job_name, - filename, # Corrected variable for single-file case user, - dir_name, - safe_user, + file_path=file_path, + filename=original_filename ) except Exception as err: current_app.logger.error(f"Error uploading file: {err}", exc_info=True) diff --git a/application/api/user/tasks.py b/application/api/user/tasks.py index c7003ef3..aa40f37b 100644 --- a/application/api/user/tasks.py +++ b/application/api/user/tasks.py @@ -11,8 +11,8 @@ from application.worker import ( @celery.task(bind=True) -def ingest(self, directory, formats, job_name, filename, user, dir_name, user_dir): - resp = ingest_worker(self, directory, formats, job_name, filename, user, dir_name, user_dir) +def ingest(self, directory, formats, job_name, user, file_path, filename): + resp = ingest_worker(self, directory, formats, job_name, file_path, filename, user) return resp diff --git a/application/worker.py b/application/worker.py index c6178931..e685b371 100755 --- a/application/worker.py +++ b/application/worker.py @@ -194,7 +194,8 @@ def run_agent_logic(agent_config, input_data): # Define the main function for ingesting and processing documents. def ingest_worker( - self, directory, formats, job_name, filename, user, dir_name=None, user_dir=None, retriever="classic" + self, directory, formats, job_name, file_path, filename, user, + retriever="classic" ): """ Ingest and process documents. @@ -204,10 +205,9 @@ def ingest_worker( directory (str): Specifies the directory for ingesting ('inputs' or 'temp'). formats (list of str): List of file extensions to consider for ingestion (e.g., [".rst", ".md"]). job_name (str): Name of the job for this ingestion task (original, unsanitized). - filename (str): Name of the file to be ingested. + file_path (str): Complete file path to use consistently throughout the pipeline. + filename (str): Original unsanitized filename provided by the user. user (str): Identifier for the user initiating the ingestion (original, unsanitized). - dir_name (str, optional): Sanitized directory name for filesystem operations. - user_dir (str, optional): Sanitized user ID for filesystem operations. retriever (str): Type of retriever to use for processing the documents. Returns: @@ -220,11 +220,8 @@ def ingest_worker( sample = False storage = StorageCreator.get_storage() - - full_path = os.path.join(directory, user_dir, dir_name) - source_file_path = os.path.join(full_path, filename) - - logging.info(f"Ingest file: {full_path}", extra={"user": user, "job": job_name}) + + logging.info(f"Ingest file: {file_path}", extra={"user": user, "job": job_name}) # Create temporary working directory with tempfile.TemporaryDirectory() as temp_dir: @@ -232,8 +229,10 @@ def ingest_worker( os.makedirs(temp_dir, exist_ok=True) # Download file from storage to temp directory - temp_file_path = os.path.join(temp_dir, filename) - file_data = storage.get_file(source_file_path) + temp_filename = os.path.basename(file_path) + temp_file_path = os.path.join(temp_dir, temp_filename) + + file_data = storage.get_file(file_path) with open(temp_file_path, "wb") as f: f.write(file_data.read()) @@ -241,8 +240,8 @@ def ingest_worker( self.update_state(state="PROGRESS", meta={"current": 1}) # Handle zip files - if filename.endswith(".zip"): - logging.info(f"Extracting zip file: {filename}") + if temp_filename.endswith(".zip"): + logging.info(f"Extracting zip file: {temp_filename}") extract_zip_recursive( temp_file_path, temp_dir, current_depth=0, max_depth=RECURSION_DEPTH ) @@ -292,7 +291,7 @@ def ingest_worker( "retriever": retriever, "id": str(id), "type": "local", - "original_file_path": source_file_path, + "file_path": file_path, } upload_index(vector_store_path, file_data) From fd905b1a06cfaa839b6705686bc9171cb5adc6a0 Mon Sep 17 00:00:00 2001 From: ManishMadan2882 Date: Wed, 2 Jul 2025 16:30:29 +0530 Subject: [PATCH 02/57] (feat:dir-reader) save tokens with filenames --- application/api/internal/routes.py | 13 +++++++++++++ application/parser/file/bulk.py | 15 +++++++++++++++ application/worker.py | 5 +++++ 3 files changed, 33 insertions(+) diff --git a/application/api/internal/routes.py b/application/api/internal/routes.py index 20ce31c7..3f839f40 100755 --- a/application/api/internal/routes.py +++ b/application/api/internal/routes.py @@ -1,5 +1,6 @@ import os import datetime +import json from flask import Blueprint, request, send_from_directory from werkzeug.utils import secure_filename from bson.objectid import ObjectId @@ -49,6 +50,16 @@ def upload_index_files(): sync_frequency = request.form["sync_frequency"] if "sync_frequency" in request.form else None file_path = request.form.get("file_path") + file_token_counts = request.form.get("file_token_counts") + + if file_token_counts: + try: + file_token_counts = json.loads(file_token_counts) + except: + logger.error("Error parsing file_token_counts") + file_token_counts = {} + else: + file_token_counts = {} storage = StorageCreator.get_storage() index_base_path = f"indexes/{id}" @@ -88,6 +99,7 @@ def upload_index_files(): "remote_data": remote_data, "sync_frequency": sync_frequency, "file_path": file_path, + "file_token_counts": file_token_counts, } }, ) @@ -106,6 +118,7 @@ def upload_index_files(): "remote_data": remote_data, "sync_frequency": sync_frequency, "file_path": file_path, + "file_token_counts": file_token_counts, } ) return {"status": "ok"} diff --git a/application/parser/file/bulk.py b/application/parser/file/bulk.py index da6dc298..2851dcdd 100644 --- a/application/parser/file/bulk.py +++ b/application/parser/file/bulk.py @@ -15,6 +15,7 @@ from application.parser.file.json_parser import JSONParser from application.parser.file.pptx_parser import PPTXParser from application.parser.file.image_parser import ImageParser from application.parser.schema.base import Document +from application.utils import num_tokens_from_string DEFAULT_FILE_EXTRACTOR: Dict[str, BaseParser] = { ".pdf": PDFParser(), @@ -146,6 +147,8 @@ class SimpleDirectoryReader(BaseReader): data: Union[str, List[str]] = "" data_list: List[str] = [] metadata_list = [] + file_token_counts = {} + for input_file in self.input_files: if input_file.suffix in self.file_extractor: parser = self.file_extractor[input_file.suffix] @@ -156,6 +159,15 @@ class SimpleDirectoryReader(BaseReader): # do standard read with open(input_file, "r", errors=self.errors) as f: data = f.read() + + # Calculate token count for this file + if isinstance(data, List): + file_tokens = sum(num_tokens_from_string(str(d)) for d in data) + else: + file_tokens = num_tokens_from_string(str(data)) + + file_token_counts[input_file.name] = file_tokens + # Prepare metadata for this file if self.file_metadata is not None: file_metadata = self.file_metadata(input_file.name) @@ -175,6 +187,9 @@ class SimpleDirectoryReader(BaseReader): # Add the file's metadata to metadata_list metadata_list.append(file_metadata) + self.file_token_counts = file_token_counts + logging.info(f"File token counts: {file_token_counts}") + if concatenate: return [Document("\n".join(data_list))] elif self.file_metadata is not None: diff --git a/application/worker.py b/application/worker.py index e685b371..805fa0ed 100755 --- a/application/worker.py +++ b/application/worker.py @@ -258,6 +258,10 @@ def ingest_worker( file_metadata=metadata_from_filename, ) raw_docs = reader.load_data() + + file_token_counts = getattr(reader, 'file_token_counts', {}) + + logging.info(f"File token counts from reader: {file_token_counts}") chunker = Chunker( chunking_strategy="classic_chunk", @@ -292,6 +296,7 @@ def ingest_worker( "id": str(id), "type": "local", "file_path": file_path, + "file_token_counts": json.dumps(file_token_counts), } upload_index(vector_store_path, file_data) From 2ef23fe1b37822c16feaf96bf2ca92a381408a3f Mon Sep 17 00:00:00 2001 From: ManishMadan2882 Date: Thu, 3 Jul 2025 01:24:22 +0530 Subject: [PATCH 03/57] (feat:dir-reader) maintain dir structure in db --- application/api/internal/routes.py | 16 +++++------ application/parser/file/bulk.py | 45 ++++++++++++++++++++++++++++-- application/worker.py | 11 ++++---- 3 files changed, 56 insertions(+), 16 deletions(-) diff --git a/application/api/internal/routes.py b/application/api/internal/routes.py index 3f839f40..da87655b 100755 --- a/application/api/internal/routes.py +++ b/application/api/internal/routes.py @@ -50,16 +50,16 @@ def upload_index_files(): sync_frequency = request.form["sync_frequency"] if "sync_frequency" in request.form else None file_path = request.form.get("file_path") - file_token_counts = request.form.get("file_token_counts") + directory_structure = request.form.get("directory_structure") - if file_token_counts: + if directory_structure: try: - file_token_counts = json.loads(file_token_counts) + directory_structure = json.loads(directory_structure) except: - logger.error("Error parsing file_token_counts") - file_token_counts = {} + logger.error("Error parsing directory_structure") + directory_structure = {} else: - file_token_counts = {} + directory_structure = {} storage = StorageCreator.get_storage() index_base_path = f"indexes/{id}" @@ -99,7 +99,7 @@ def upload_index_files(): "remote_data": remote_data, "sync_frequency": sync_frequency, "file_path": file_path, - "file_token_counts": file_token_counts, + "directory_structure": directory_structure, } }, ) @@ -118,7 +118,7 @@ def upload_index_files(): "remote_data": remote_data, "sync_frequency": sync_frequency, "file_path": file_path, - "file_token_counts": file_token_counts, + "directory_structure": directory_structure, } ) return {"status": "ok"} diff --git a/application/parser/file/bulk.py b/application/parser/file/bulk.py index 2851dcdd..85ed9404 100644 --- a/application/parser/file/bulk.py +++ b/application/parser/file/bulk.py @@ -142,7 +142,6 @@ class SimpleDirectoryReader(BaseReader): Returns: List[Document]: A list of documents. - """ data: Union[str, List[str]] = "" data_list: List[str] = [] @@ -188,7 +187,13 @@ class SimpleDirectoryReader(BaseReader): metadata_list.append(file_metadata) self.file_token_counts = file_token_counts - logging.info(f"File token counts: {file_token_counts}") + + # Build directory structure if input_dir is provided + if hasattr(self, 'input_dir'): + self.directory_structure = self._build_directory_structure(self.input_dir) + logging.info(f"Directory structure built successfully") + else: + self.directory_structure = {} if concatenate: return [Document("\n".join(data_list))] @@ -196,3 +201,39 @@ class SimpleDirectoryReader(BaseReader): return [Document(d, extra_info=m) for d, m in zip(data_list, metadata_list)] else: return [Document(d) for d in data_list] + + def _build_directory_structure(self, base_path): + """Build a dictionary representing the directory structure. + + Args: + base_path: The base path to start building the structure from. + + Returns: + dict: A nested dictionary representing the directory structure. + """ + structure = {} + base_path = Path(base_path) + + def _build_tree(path, current_dict): + for item in path.iterdir(): + if item.is_dir(): + if self.exclude_hidden and item.name.startswith('.'): + continue + current_dict[item.name] = {} + _build_tree(item, current_dict[item.name]) + else: + if self.exclude_hidden and item.name.startswith('.'): + continue + if self.required_exts is not None and item.suffix not in self.required_exts: + continue + # Store file with its token count if available + if hasattr(self, 'file_token_counts') and item.name in self.file_token_counts: + current_dict[item.name] = { + "type": "file", + "token_count": self.file_token_counts[item.name] + } + else: + current_dict[item.name] = {"type": "file"} + + _build_tree(base_path, structure) + return structure diff --git a/application/worker.py b/application/worker.py index 805fa0ed..8cc7ac20 100755 --- a/application/worker.py +++ b/application/worker.py @@ -259,9 +259,8 @@ def ingest_worker( ) raw_docs = reader.load_data() - file_token_counts = getattr(reader, 'file_token_counts', {}) - - logging.info(f"File token counts from reader: {file_token_counts}") + directory_structure = getattr(reader, 'directory_structure', {}) + logging.info(f"Directory structure from reader: {directory_structure}") chunker = Chunker( chunking_strategy="classic_chunk", @@ -288,15 +287,15 @@ def ingest_worker( for i in range(min(5, len(raw_docs))): logging.info(f"Sample document {i}: {raw_docs[i]}") file_data = { - "name": job_name, # Use original job_name + "name": job_name, "file": filename, - "user": user, # Use original user + "user": user, "tokens": tokens, "retriever": retriever, "id": str(id), "type": "local", "file_path": file_path, - "file_token_counts": json.dumps(file_token_counts), + "directory_structure": json.dumps(directory_structure), } upload_index(vector_store_path, file_data) From 82fc19e7b78d3c6206a8c246077be5670aca276a Mon Sep 17 00:00:00 2001 From: ManishMadan2882 Date: Thu, 3 Jul 2025 17:28:12 +0530 Subject: [PATCH 04/57] (fix:dir-reader) conflict of same filename in dir --- application/parser/file/bulk.py | 42 ++++++++++++++++++++------------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/application/parser/file/bulk.py b/application/parser/file/bulk.py index 85ed9404..11e69f75 100644 --- a/application/parser/file/bulk.py +++ b/application/parser/file/bulk.py @@ -146,7 +146,7 @@ class SimpleDirectoryReader(BaseReader): data: Union[str, List[str]] = "" data_list: List[str] = [] metadata_list = [] - file_token_counts = {} + self.file_token_counts = {} for input_file in self.input_files: if input_file.suffix in self.file_extractor: @@ -165,28 +165,34 @@ class SimpleDirectoryReader(BaseReader): else: file_tokens = num_tokens_from_string(str(data)) - file_token_counts[input_file.name] = file_tokens + full_path = str(input_file.resolve()) + self.file_token_counts[full_path] = file_tokens - # Prepare metadata for this file - if self.file_metadata is not None: - file_metadata = self.file_metadata(input_file.name) + base_metadata = { + 'title': input_file.name, + 'token_count': file_tokens, + } + + if hasattr(self, 'input_dir'): + try: + relative_path = str(input_file.relative_to(self.input_dir)) + base_metadata['source'] = relative_path + except ValueError: + base_metadata['source'] = str(input_file) else: - # Provide a default empty metadata - file_metadata = {'title': '', 'store': ''} - # TODO: Find a case with no metadata and check if breaks anything + base_metadata['source'] = str(input_file) + + if self.file_metadata is not None: + custom_metadata = self.file_metadata(input_file.name) + base_metadata.update(custom_metadata) if isinstance(data, List): # Extend data_list with each item in the data list data_list.extend([str(d) for d in data]) - # For each item in the data list, add the file's metadata to metadata_list - metadata_list.extend([file_metadata for _ in data]) + metadata_list.extend([base_metadata for _ in data]) else: - # Add the single piece of data to data_list data_list.append(str(data)) - # Add the file's metadata to metadata_list - metadata_list.append(file_metadata) - - self.file_token_counts = file_token_counts + metadata_list.append(base_metadata) # Build directory structure if input_dir is provided if hasattr(self, 'input_dir'): @@ -227,10 +233,12 @@ class SimpleDirectoryReader(BaseReader): if self.required_exts is not None and item.suffix not in self.required_exts: continue # Store file with its token count if available - if hasattr(self, 'file_token_counts') and item.name in self.file_token_counts: + + full_path = str(item.resolve()) + if hasattr(self, 'file_token_counts') and full_path in self.file_token_counts: current_dict[item.name] = { "type": "file", - "token_count": self.file_token_counts[item.name] + "token_count": self.file_token_counts[full_path] } else: current_dict[item.name] = {"type": "file"} From 0f7ebcd8e45f6db9735b6e2cde63ff99076cc59a Mon Sep 17 00:00:00 2001 From: ManishMadan2882 Date: Thu, 3 Jul 2025 18:09:19 +0530 Subject: [PATCH 05/57] (feat:dir-reader) store mime types, file size in db --- application/parser/file/bulk.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/application/parser/file/bulk.py b/application/parser/file/bulk.py index 11e69f75..6f8d6cde 100644 --- a/application/parser/file/bulk.py +++ b/application/parser/file/bulk.py @@ -232,16 +232,24 @@ class SimpleDirectoryReader(BaseReader): continue if self.required_exts is not None and item.suffix not in self.required_exts: continue - # Store file with its token count if available - + full_path = str(item.resolve()) + file_size_bytes = item.stat().st_size + + import mimetypes + mime_type = mimetypes.guess_type(item.name)[0] or "application/octet-stream" + if hasattr(self, 'file_token_counts') and full_path in self.file_token_counts: current_dict[item.name] = { - "type": "file", - "token_count": self.file_token_counts[full_path] + "type": mime_type, + "token_count": self.file_token_counts[full_path], + "size_bytes": file_size_bytes } else: - current_dict[item.name] = {"type": "file"} + current_dict[item.name] = { + "type": mime_type, + "size_bytes": file_size_bytes + } _build_tree(base_path, structure) return structure From bbce872ac5e65b8a032689ffb745a547157d4f3b Mon Sep 17 00:00:00 2001 From: ManishMadan2882 Date: Fri, 4 Jul 2025 02:19:58 +0530 Subject: [PATCH 06/57] (fix:chunker) combine metadata as well --- application/parser/chunking.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/application/parser/chunking.py b/application/parser/chunking.py index aae14898..f2a69dac 100644 --- a/application/parser/chunking.py +++ b/application/parser/chunking.py @@ -35,11 +35,33 @@ class Chunker: def combine_documents(self, doc: Document, next_doc: Document) -> Document: combined_text = doc.text + " " + next_doc.text combined_token_count = len(self.encoding.encode(combined_text)) + + combined_extra_info = {**(doc.extra_info or {}), "token_count": combined_token_count} + + sources = [] + if doc.extra_info and 'source' in doc.extra_info: + sources.append(doc.extra_info['source']) + if next_doc.extra_info and 'source' in next_doc.extra_info: + sources.append(next_doc.extra_info['source']) + + if sources: + combined_extra_info['source'] = sources + + titles = [] + if doc.extra_info and 'title' in doc.extra_info: + titles.append(doc.extra_info['title']) + if next_doc.extra_info and 'title' in next_doc.extra_info: + titles.append(next_doc.extra_info['title']) + + # Store combined title + if titles: + combined_extra_info['title'] = ", ".join(titles) + new_doc = Document( text=combined_text, doc_id=doc.doc_id, embedding=doc.embedding, - extra_info={**(doc.extra_info or {}), "token_count": combined_token_count} + extra_info=combined_extra_info ) return new_doc From ca95d7275a8fb9852ad7f37ef5ae189d3bdf0527 Mon Sep 17 00:00:00 2001 From: ManishMadan2882 Date: Tue, 8 Jul 2025 02:32:18 +0530 Subject: [PATCH 07/57] (feat:dateTimeUtils) localise weekday format --- frontend/src/utils/dateTimeUtils.ts | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/frontend/src/utils/dateTimeUtils.ts b/frontend/src/utils/dateTimeUtils.ts index caddf961..acb998a9 100644 --- a/frontend/src/utils/dateTimeUtils.ts +++ b/frontend/src/utils/dateTimeUtils.ts @@ -34,6 +34,21 @@ export function formatDate(dateString: string): string { day: 'numeric', year: 'numeric', }); + } else if ( + /^[A-Za-z]{3}, \d{2} [A-Za-z]{3} \d{4} \d{2}:\d{2}:\d{2} GMT$/.test( + dateString, + ) + ) { + // Format: "Fri, 08 Jul 2025 06:00:00 GMT" + const dateTime = new Date(dateString); + return dateTime.toLocaleDateString('en-US', { + weekday: 'short', + month: 'short', + day: 'numeric', + year: 'numeric', + hour: '2-digit', + minute: '2-digit', + }); } else { return dateString; } From 4fcbdae5bf8820f559d362fa158eb9aedd46d0de Mon Sep 17 00:00:00 2001 From: ManishMadan2882 Date: Tue, 8 Jul 2025 02:46:34 +0530 Subject: [PATCH 08/57] (feat:docs) cards UI --- frontend/src/settings/Documents.tsx | 242 +++++++++++++--------------- 1 file changed, 109 insertions(+), 133 deletions(-) diff --git a/frontend/src/settings/Documents.tsx b/frontend/src/settings/Documents.tsx index 1e628250..63aea342 100644 --- a/frontend/src/settings/Documents.tsx +++ b/frontend/src/settings/Documents.tsx @@ -4,7 +4,6 @@ import { useDispatch, useSelector } from 'react-redux'; import userService from '../api/services/userService'; import ArrowLeft from '../assets/arrow-left.svg'; -import caretSort from '../assets/caret-sort.svg'; import Edit from '../assets/edit.svg'; import EyeView from '../assets/eye-view.svg'; import NoFilesDarkIcon from '../assets/no-files-dark.svg'; @@ -323,146 +322,123 @@ export default function Documents({
-
-
- - - - - - - - - - - {loading ? ( - - ) : !currentDocuments?.length ? ( - - - - ) : ( - currentDocuments.map((document, index) => { - const docId = document.id ? document.id.toString() : ''; + {loading ? ( + + ) : !currentDocuments?.length ? ( +
+ {t('settings.documents.noData')} +

+ {t('settings.documents.noData')} +

+
+ ) : ( +
+ {currentDocuments.map((document, index) => { + const docId = document.id ? document.id.toString() : ''; - return ( -
- - - - - - ); - }) - )} - -
- {t('settings.documents.name')} - -
- {t('settings.documents.date')} - refreshDocs('date')} - src={caretSort} - alt="sort" - /> -
-
-
- - {t('settings.documents.tokenUsage')} - - - {t('settings.documents.tokenUsage')} - - refreshDocs('tokens')} - src={caretSort} - alt="sort" - /> -
-
- {t('settings.documents.actions')} -
- {t('settings.documents.noData')} -
+
+
+
+

{document.name} -

- {document.date ? formatDate(document.date) : ''} - + +
+ {document.syncFrequency && ( + { + handleManageSync(document, value); + }} + defaultValue={document.syncFrequency} + icon={SyncIcon} + isOpen={ + syncMenuState.docId === docId && + syncMenuState.isOpen + } + onOpenChange={(isOpen) => { + setSyncMenuState((prev) => ({ + ...prev, + isOpen, + docId: isOpen ? docId : null, + document: isOpen ? document : null, + })); + }} + anchorRef={getMenuRef(docId)} + position="bottom-left" + offset={{ x: 24, y: -24 }} + className="min-w-[120px]" + /> + )} + +
+ + + +
+
+ {document.date ? formatDate(document.date) : ''} +
+
+ + {t('settings.documents.tokenUsage')}: + + {document.tokens ? formatTokens(+document.tokens) : ''} -
e.stopPropagation()} - > -
- {document.syncFrequency && ( - { - handleManageSync(document, value); - }} - defaultValue={document.syncFrequency} - icon={SyncIcon} - isOpen={ - syncMenuState.docId === docId && - syncMenuState.isOpen - } - onOpenChange={(isOpen) => { - setSyncMenuState((prev) => ({ - ...prev, - isOpen, - docId: isOpen ? docId : null, - document: isOpen ? document : null, - })); - }} - anchorRef={getMenuRef(docId)} - position="bottom-left" - offset={{ x: 24, y: -24 }} - className="min-w-[120px]" - /> - )} - - { - setActiveMenuId(isOpen ? docId : null); - }} - options={getActionOptions(index, document)} - anchorRef={getMenuRef(docId)} - position="bottom-left" - offset={{ x: 48, y: 0 }} - className="z-50" - /> -
-
+ +
+
+
+ { + setActiveMenuId(isOpen ? docId : null); + }} + options={getActionOptions(index, document)} + anchorRef={getMenuRef(docId)} + position="bottom-left" + offset={{ x: 48, y: 0 }} + className="z-50" + /> + + ); + })} - + )} From f60c516185149bac5ca4544c3621e61ff4a1916c Mon Sep 17 00:00:00 2001 From: ManishMadan2882 Date: Fri, 11 Jul 2025 01:44:30 +0530 Subject: [PATCH 09/57] (feat:dir_tree) table with folder contents --- frontend/src/components/FileTreeComponent.tsx | 357 ++++++++++++++++++ 1 file changed, 357 insertions(+) create mode 100644 frontend/src/components/FileTreeComponent.tsx diff --git a/frontend/src/components/FileTreeComponent.tsx b/frontend/src/components/FileTreeComponent.tsx new file mode 100644 index 00000000..b106c106 --- /dev/null +++ b/frontend/src/components/FileTreeComponent.tsx @@ -0,0 +1,357 @@ +import React, { useEffect, useState, useRef } from 'react'; +import { useSelector } from 'react-redux'; +import userService from '../api/services/userService'; +import FileIcon from '../assets/file.svg'; +import FolderIcon from '../assets/folder.svg'; +import ArrowLeft from '../assets/arrow-left.svg'; +import ThreeDots from '../assets/three-dots.svg'; +import EyeView from '../assets/eye-view.svg'; +import OutlineSource from '../assets/outline-source.svg'; +import Trash from '../assets/red-trash.svg'; +import Spinner from './Spinner'; +import { useTranslation } from 'react-i18next'; +import ContextMenu, { MenuOption } from './ContextMenu'; + +interface FileNode { + type?: string; + token_count?: number; + size_bytes?: number; + [key: string]: any; +} + +interface DirectoryStructure { + [key: string]: FileNode; +} + +interface FileTreeComponentProps { + docId: string; + sourceName: string; + onBackToDocuments?: () => void; +} + +const FileTreeComponent: React.FC = ({ + docId, + sourceName, + onBackToDocuments, +}) => { + const { t } = useTranslation(); + const [loading, setLoading] = useState(true); + const [error, setError] = useState(null); + const [directoryStructure, setDirectoryStructure] = + useState(null); + const [currentPath, setCurrentPath] = useState([]); + const token = useSelector((state: any) => state.auth?.token); + const [activeMenuId, setActiveMenuId] = useState(null); + const menuRefs = useRef<{ + [key: string]: React.RefObject; + }>({}); + + useEffect(() => { + const fetchDirectoryStructure = async () => { + try { + setLoading(true); + const response = await userService.getDirectoryStructure(docId, token); + const data = await response.json(); + + if (data && data.directory_structure) { + setDirectoryStructure(data.directory_structure); + } else { + setError('Invalid response format'); + } + } catch (err) { + setError('Failed to load directory structure'); + console.error(err); + } finally { + setLoading(false); + } + }; + + if (docId) { + fetchDirectoryStructure(); + } + }, [docId, token]); + + const formatBytes = (bytes: number): string => { + if (bytes === 0) return '0 Bytes'; + const k = 1024; + const sizes = ['Bytes', 'KB', 'MB', 'GB']; + const i = Math.floor(Math.log(bytes) / Math.log(k)); + return parseFloat((bytes / Math.pow(k, i)).toFixed(2)) + ' ' + sizes[i]; + }; + + const navigateToDirectory = (dirName: string) => { + setCurrentPath((prev) => [...prev, dirName]); + }; + + const navigateUp = () => { + setCurrentPath((prev) => prev.slice(0, -1)); + }; + + const getCurrentDirectory = (): DirectoryStructure => { + if (!directoryStructure) return {}; + + let current: any = directoryStructure; + for (const dir of currentPath) { + if (current[dir] && !current[dir].type) { + current = current[dir]; + } else { + return {}; + } + } + return current; + }; + + const handleBackNavigation = () => { + if (currentPath.length === 0) { + if (onBackToDocuments) { + onBackToDocuments(); + } + } else { + navigateUp(); + } + }; + + const getMenuRef = (itemId: string) => { + if (!menuRefs.current[itemId]) { + menuRefs.current[itemId] = React.createRef(); + } + return menuRefs.current[itemId]; + }; + + const handleMenuClick = (e: React.MouseEvent, itemId: string) => { + e.preventDefault(); + e.stopPropagation(); + + if (activeMenuId === itemId) { + setActiveMenuId(null); + return; + } + setActiveMenuId(itemId); + }; + + const getActionOptions = (name: string, isFile: boolean): MenuOption[] => { + const options: MenuOption[] = []; + + if (isFile) { + options.push({ + icon: EyeView, + label: t('settings.documents.view'), + onClick: (event: React.SyntheticEvent) => { + event.stopPropagation(); + console.log('View file:', name); + // View file action will be implemented later + }, + iconWidth: 18, + iconHeight: 18, + variant: 'primary', + }); + } + + options.push({ + icon: Trash, + label: t('convTile.delete'), + onClick: (event: React.SyntheticEvent) => { + event.stopPropagation(); + console.log('Delete item:', name); + // Delete action will be implemented later + }, + iconWidth: 18, + iconHeight: 18, + variant: 'danger', + }); + + return options; + }; + + const renderPathNavigation = () => { + return ( +
+ + +
+ source + {sourceName} + {currentPath.length > 0 && ( + <> + / + {currentPath.map((dir, index) => ( + + + {dir} + + {index < currentPath.length - 1 && ( + / + )} + + ))} + + )} +
+
+ ); + }; + const renderFileTree = (structure: DirectoryStructure): React.ReactNode[] => { + const entries = Object.entries(structure); + const directories = entries.filter(([_, node]) => !node.type); + const files = entries.filter(([_, node]) => node.type); + + return [ + ...directories.map(([name, node]) => { + const itemId = `dir-${name}`; + const menuRef = getMenuRef(itemId); + + return ( + + +
navigateToDirectory(name)} + > + Folder + {name} +
+ + - + - + +
+ + + setActiveMenuId(isOpen ? itemId : null) + } + options={getActionOptions(name, false)} + anchorRef={menuRef} + position="bottom-left" + offset={{ x: 0, y: 8 }} + /> +
+ + + ); + }), + ...files.map(([name, node]) => { + const itemId = `file-${name}`; + const menuRef = getMenuRef(itemId); + + return ( + + +
+ File + {name} +
+ + + {node.token_count?.toLocaleString() || '-'} + + + {node.size_bytes ? formatBytes(node.size_bytes) : '-'} + + +
+ + + setActiveMenuId(isOpen ? itemId : null) + } + options={getActionOptions(name, true)} + anchorRef={menuRef} + position="bottom-left" + offset={{ x: 0, y: 8 }} + /> +
+ + + ); + }), + ]; + }; + + if (loading) { + return ( +
+ +
+ ); + } + + if (error) { + return
{error}
; + } + + if (!directoryStructure) { + return ( +
+ No directory structure available +
+ ); + } + + const currentDirectory = getCurrentDirectory(); + + return ( +
+
{renderPathNavigation()}
+ +
+ + + + + + + + + + + {renderFileTree(currentDirectory)} + +
+ Name + + Tokens + + Size + + Actions +
+
+
+ ); +}; + +export default FileTreeComponent; From a24a3f868cc087c52ef3178661adab5d62093626 Mon Sep 17 00:00:00 2001 From: ManishMadan2882 Date: Mon, 14 Jul 2025 22:56:03 +0530 Subject: [PATCH 10/57] (feat:dir-structure) adding route --- application/api/user/routes.py | 50 ++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/application/api/user/routes.py b/application/api/user/routes.py index 71d23375..f51ecc10 100644 --- a/application/api/user/routes.py +++ b/application/api/user/routes.py @@ -7,6 +7,8 @@ import shutil import uuid from functools import wraps from typing import Optional, Tuple +import tempfile +import zipfile from bson.binary import Binary, UuidRepresentation from bson.dbref import DBRef @@ -3609,3 +3611,51 @@ class ServeImage(Resource): return make_response( jsonify({"success": False, "message": "Error retrieving image"}), 500 ) + + +@user_ns.route("/api/directory_structure") +class DirectoryStructure(Resource): + @api.doc( + description="Get the directory structure for a document", + params={"id": "The document ID"}, + ) + def get(self): + decoded_token = request.decoded_token + if not decoded_token: + return make_response(jsonify({"success": False}), 401) + + user = decoded_token.get("sub") + doc_id = request.args.get("id") + + if not doc_id: + return make_response( + jsonify({"error": "Document ID is required"}), 400 + ) + + if not ObjectId.is_valid(doc_id): + return make_response(jsonify({"error": "Invalid document ID"}), 400) + + try: + doc = sources_collection.find_one({"_id": ObjectId(doc_id), "user": user}) + if not doc: + return make_response( + jsonify({"error": "Document not found or access denied"}), 404 + ) + + directory_structure = doc.get("directory_structure", {}) + + return make_response( + jsonify({ + "success": True, + "directory_structure": directory_structure, + "base_path": doc.get("file_path", "") + }), 200 + ) + + except Exception as e: + current_app.logger.error( + f"Error retrieving directory structure: {e}", exc_info=True + ) + return make_response( + jsonify({"success": False, "error": str(e)}), 500 + ) From a38d71bbfb0aa3b1df87f0a78f5c1c3b53ba0409 Mon Sep 17 00:00:00 2001 From: ManishMadan2882 Date: Mon, 14 Jul 2025 22:57:29 +0530 Subject: [PATCH 11/57] (feat:get_chunks) filtered by relative path --- application/api/user/routes.py | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/application/api/user/routes.py b/application/api/user/routes.py index f51ecc10..d7b923bf 100644 --- a/application/api/user/routes.py +++ b/application/api/user/routes.py @@ -3322,8 +3322,13 @@ class DeleteTool(Resource): @user_ns.route("/api/get_chunks") class GetChunks(Resource): @api.doc( - description="Retrieves all chunks associated with a document", - params={"id": "The document ID"}, + description="Retrieves chunks from a document, optionally filtered by file path", + params={ + "id": "The document ID", + "page": "Page number for pagination", + "per_page": "Number of chunks per page", + "file_path": "Optional: Filter chunks by relative file path" + }, ) def get(self): decoded_token = request.decoded_token @@ -3333,6 +3338,7 @@ class GetChunks(Resource): doc_id = request.args.get("id") page = int(request.args.get("page", 1)) per_page = int(request.args.get("per_page", 10)) + file_path = request.args.get("file_path") if not ObjectId.is_valid(doc_id): return make_response(jsonify({"error": "Invalid doc_id"}), 400) @@ -3344,6 +3350,22 @@ class GetChunks(Resource): try: store = get_vector_store(doc_id) chunks = store.get_chunks() + + if file_path: + filtered_chunks = [] + for chunk in chunks: + metadata = chunk.get("metadata", {}) + source = metadata.get("source", "") + + if isinstance(source, str) and source.endswith(file_path): + filtered_chunks.append(chunk) + elif isinstance(source, list): + for src in source: + if isinstance(src, str) and src.endswith(file_path): + filtered_chunks.append(chunk) + break + chunks = filtered_chunks + total_chunks = len(chunks) start = (page - 1) * per_page end = start + per_page @@ -3356,6 +3378,7 @@ class GetChunks(Resource): "per_page": per_page, "total": total_chunks, "chunks": paginated_chunks, + "file_path": file_path if file_path else None } ), 200, From 5b07c5f2e84faca36ba520fd80241ba5743772b2 Mon Sep 17 00:00:00 2001 From: ManishMadan2882 Date: Tue, 15 Jul 2025 15:31:26 +0530 Subject: [PATCH 12/57] (feat:ingestion) unzip, extract and store --- application/api/user/routes.py | 190 ++++++++++----------------------- application/worker.py | 54 +++++++--- 2 files changed, 95 insertions(+), 149 deletions(-) diff --git a/application/api/user/routes.py b/application/api/user/routes.py index d7b923bf..68936dcc 100644 --- a/application/api/user/routes.py +++ b/application/api/user/routes.py @@ -548,138 +548,60 @@ class UploadFile(Resource): # Create safe versions for filesystem operations safe_user = safe_filename(user) dir_name = safe_filename(job_name) + base_path = f"{settings.UPLOAD_FOLDER}/{safe_user}/{dir_name}" try: storage = StorageCreator.get_storage() - base_path = f"{settings.UPLOAD_FOLDER}/{safe_user}/{dir_name}" - - if len(files) > 1: - temp_files = [] - for file in files: - original_filename = file.filename - safe_file = safe_filename(original_filename) - temp_path = f"{base_path}/temp/{safe_file}" - storage.save_file(file, temp_path) - temp_files.append({"path": temp_path, "original_name": original_filename}) - print(f"Saved file: {original_filename}") - zip_filename = f"{dir_name}.zip" - zip_path = f"{base_path}/{zip_filename}" - zip_temp_path = None - - def create_zip_archive(temp_paths, dir_name, storage): - import tempfile - - with tempfile.NamedTemporaryFile( - delete=False, suffix=".zip" - ) as temp_zip_file: - zip_output_path = temp_zip_file.name - with tempfile.TemporaryDirectory() as stage_dir: - for path in temp_paths: - try: - file_data = storage.get_file(path) - with open( - os.path.join(stage_dir, os.path.basename(path)), - "wb", - ) as f: - f.write(file_data.read()) - except Exception as e: - current_app.logger.error( - f"Error processing file {path} for zipping: {e}", - exc_info=True, - ) - if os.path.exists(zip_output_path): - os.remove(zip_output_path) - raise - try: - shutil.make_archive( - base_name=zip_output_path.replace(".zip", ""), - format="zip", - root_dir=stage_dir, - ) - except Exception as e: - current_app.logger.error( - f"Error creating zip archive: {e}", exc_info=True - ) - if os.path.exists(zip_output_path): - os.remove(zip_output_path) - raise - return zip_output_path - - try: - zip_temp_path = create_zip_archive(temp_files, dir_name, storage) - with open(zip_temp_path, "rb") as zip_file: - storage.save_file(zip_file, zip_path) - task = ingest.delay( - settings.UPLOAD_FOLDER, - [ - ".rst", - ".md", - ".pdf", - ".txt", - ".docx", - ".csv", - ".epub", - ".html", - ".mdx", - ".json", - ".xlsx", - ".pptx", - ".png", - ".jpg", - ".jpeg", - ], - job_name, - user, - file_path=zip_path, - filename=zip_filename - ) - finally: - # Clean up temporary files - for temp_path in temp_files: - try: - storage.delete_file(temp_path) - except Exception as e: - current_app.logger.error( - f"Error deleting temporary file {temp_path}: {e}", - exc_info=True, - ) - # Clean up the zip file if it was created - if zip_temp_path and os.path.exists(zip_temp_path): - os.remove(zip_temp_path) - else: # Keep this else block for single file upload - # For single file - - file = files[0] + + + for file in files: original_filename = file.filename safe_file = safe_filename(original_filename) - file_path = f"{base_path}/{safe_file}" - - storage.save_file(file, file_path) - - task = ingest.delay( - settings.UPLOAD_FOLDER, - [ - ".rst", - ".md", - ".pdf", - ".txt", - ".docx", - ".csv", - ".epub", - ".html", - ".mdx", - ".json", - ".xlsx", - ".pptx", - ".png", - ".jpg", - ".jpeg", - ], - job_name, - user, - file_path=file_path, - filename=original_filename - ) + + with tempfile.TemporaryDirectory() as temp_dir: + temp_file_path = os.path.join(temp_dir, safe_file) + file.save(temp_file_path) + + if zipfile.is_zipfile(temp_file_path): + try: + with zipfile.ZipFile(temp_file_path, 'r') as zip_ref: + zip_ref.extractall(path=temp_dir) + + # Walk through extracted files and upload them + for root, _, files in os.walk(temp_dir): + for extracted_file in files: + if os.path.join(root, extracted_file) == temp_file_path: + continue + + rel_path = os.path.relpath(os.path.join(root, extracted_file), temp_dir) + storage_path = f"{base_path}/{rel_path}" + + with open(os.path.join(root, extracted_file), 'rb') as f: + storage.save_file(f, storage_path) + except Exception as e: + current_app.logger.error(f"Error extracting zip: {e}", exc_info=True) + # If zip extraction fails, save the original zip file + file_path = f"{base_path}/{safe_file}" + with open(temp_file_path, 'rb') as f: + storage.save_file(f, file_path) + else: + # For non-zip files, save directly + file_path = f"{base_path}/{safe_file}" + with open(temp_file_path, 'rb') as f: + storage.save_file(f, file_path) + + task = ingest.delay( + settings.UPLOAD_FOLDER, + [ + ".rst", ".md", ".pdf", ".txt", ".docx", ".csv", ".epub", + ".html", ".mdx", ".json", ".xlsx", ".pptx", ".png", + ".jpg", ".jpeg", + ], + job_name, + user, + file_path=base_path, + filename=dir_name + ) except Exception as err: current_app.logger.error(f"Error uploading file: {err}", exc_info=True) return make_response(jsonify({"success": False}), 400) @@ -831,6 +753,7 @@ class PaginatedSources(Resource): "tokens": doc.get("tokens", ""), "retriever": doc.get("retriever", "classic"), "syncFrequency": doc.get("sync_frequency", ""), + "isNested": bool(doc.get("directory_structure")) } paginated_docs.append(doc_data) response = { @@ -878,6 +801,7 @@ class CombinedJson(Resource): "tokens": index.get("tokens", ""), "retriever": index.get("retriever", "classic"), "syncFrequency": index.get("sync_frequency", ""), + "is_nested": bool(index.get("directory_structure")) } ) except Exception as err: @@ -3327,7 +3251,7 @@ class GetChunks(Resource): "id": "The document ID", "page": "Page number for pagination", "per_page": "Number of chunks per page", - "file_path": "Optional: Filter chunks by relative file path" + "path": "Optional: Filter chunks by relative file path" }, ) def get(self): @@ -3338,7 +3262,7 @@ class GetChunks(Resource): doc_id = request.args.get("id") page = int(request.args.get("page", 1)) per_page = int(request.args.get("per_page", 10)) - file_path = request.args.get("file_path") + path = request.args.get("path") if not ObjectId.is_valid(doc_id): return make_response(jsonify({"error": "Invalid doc_id"}), 400) @@ -3351,17 +3275,17 @@ class GetChunks(Resource): store = get_vector_store(doc_id) chunks = store.get_chunks() - if file_path: + if path: filtered_chunks = [] for chunk in chunks: metadata = chunk.get("metadata", {}) source = metadata.get("source", "") - if isinstance(source, str) and source.endswith(file_path): + if isinstance(source, str) and source.endswith(path): filtered_chunks.append(chunk) elif isinstance(source, list): for src in source: - if isinstance(src, str) and src.endswith(file_path): + if isinstance(src, str) and src.endswith(path): filtered_chunks.append(chunk) break chunks = filtered_chunks @@ -3378,7 +3302,7 @@ class GetChunks(Resource): "per_page": per_page, "total": total_chunks, "chunks": paginated_chunks, - "file_path": file_path if file_path else None + "path": path if path else None } ), 200, diff --git a/application/worker.py b/application/worker.py index 8cc7ac20..6e3cb1ae 100755 --- a/application/worker.py +++ b/application/worker.py @@ -221,31 +221,53 @@ def ingest_worker( storage = StorageCreator.get_storage() - logging.info(f"Ingest file: {file_path}", extra={"user": user, "job": job_name}) + logging.info(f"Ingest path: {file_path}", extra={"user": user, "job": job_name}) # Create temporary working directory with tempfile.TemporaryDirectory() as temp_dir: try: os.makedirs(temp_dir, exist_ok=True) - # Download file from storage to temp directory - temp_filename = os.path.basename(file_path) - temp_file_path = os.path.join(temp_dir, temp_filename) - - file_data = storage.get_file(file_path) + if storage.is_directory(file_path): + # Handle directory case + logging.info(f"Processing directory: {file_path}") + files_list = storage.list_files(file_path) + + for storage_file_path in files_list: + if storage.is_directory(storage_file_path): + continue + + # Create relative path structure in temp directory + rel_path = os.path.relpath(storage_file_path, file_path) + local_file_path = os.path.join(temp_dir, rel_path) + + os.makedirs(os.path.dirname(local_file_path), exist_ok=True) + + # Download file + try: + file_data = storage.get_file(storage_file_path) + with open(local_file_path, "wb") as f: + f.write(file_data.read()) + except Exception as e: + logging.error(f"Error downloading file {storage_file_path}: {e}") + continue + else: + # Handle single file case + temp_filename = os.path.basename(file_path) + temp_file_path = os.path.join(temp_dir, temp_filename) + + file_data = storage.get_file(file_path) + with open(temp_file_path, "wb") as f: + f.write(file_data.read()) - with open(temp_file_path, "wb") as f: - f.write(file_data.read()) + # Handle zip files + if temp_filename.endswith(".zip"): + logging.info(f"Extracting zip file: {temp_filename}") + extract_zip_recursive( + temp_file_path, temp_dir, current_depth=0, max_depth=RECURSION_DEPTH + ) self.update_state(state="PROGRESS", meta={"current": 1}) - - # Handle zip files - if temp_filename.endswith(".zip"): - logging.info(f"Extracting zip file: {temp_filename}") - extract_zip_recursive( - temp_file_path, temp_dir, current_depth=0, max_depth=RECURSION_DEPTH - ) - if sample: logging.info(f"Sample mode enabled. Using {limit} documents.") From 1d9af05e9ecb3442bb2ef892e6db38f4c6f2aba3 Mon Sep 17 00:00:00 2001 From: ManishMadan2882 Date: Tue, 15 Jul 2025 15:34:16 +0530 Subject: [PATCH 13/57] (feat:storage) is dir fnc --- application/storage/base.py | 13 +++++++++++++ application/storage/local.py | 13 +++++++++++++ application/storage/s3.py | 25 +++++++++++++++++++++++++ 3 files changed, 51 insertions(+) diff --git a/application/storage/base.py b/application/storage/base.py index 07f33c7b..50eaac7e 100644 --- a/application/storage/base.py +++ b/application/storage/base.py @@ -93,3 +93,16 @@ class BaseStorage(ABC): List[str]: List of file paths """ pass + + @abstractmethod + def is_directory(self, path: str) -> bool: + """ + Check if a path is a directory. + + Args: + path: Path to check + + Returns: + bool: True if the path is a directory + """ + pass diff --git a/application/storage/local.py b/application/storage/local.py index fb21f08d..b4530501 100644 --- a/application/storage/local.py +++ b/application/storage/local.py @@ -101,3 +101,16 @@ class LocalStorage(BaseStorage): raise FileNotFoundError(f"File not found: {full_path}") return processor_func(local_path=full_path, **kwargs) + + def is_directory(self, path: str) -> bool: + """ + Check if a path is a directory in local storage. + + Args: + path: Path to check + + Returns: + bool: True if the path is a directory, False otherwise + """ + full_path = self._get_full_path(path) + return os.path.isdir(full_path) diff --git a/application/storage/s3.py b/application/storage/s3.py index 1babb843..36333f1c 100644 --- a/application/storage/s3.py +++ b/application/storage/s3.py @@ -130,3 +130,28 @@ class S3Storage(BaseStorage): except Exception as e: logging.error(f"Error processing S3 file {path}: {e}", exc_info=True) raise + + def is_directory(self, path: str) -> bool: + """ + Check if a path is a directory in S3 storage. + + In S3, directories are virtual concepts. A path is considered a directory + if there are objects with the path as a prefix. + + Args: + path: Path to check + + Returns: + bool: True if the path is a directory, False otherwise + """ + # Ensure path ends with a slash if not empty + if path and not path.endswith('/'): + path += '/' + + response = self.s3.list_objects_v2( + Bucket=self.bucket_name, + Prefix=path, + MaxKeys=1 + ) + + return 'Contents' in response From 8a7806ab2dcfd265fa22fa5d1513d6dc2ac500e3 Mon Sep 17 00:00:00 2001 From: ManishMadan2882 Date: Tue, 15 Jul 2025 19:15:40 +0530 Subject: [PATCH 14/57] (feat:nested source view) file tree, chunks display --- frontend/src/api/endpoints.ts | 11 +- frontend/src/api/services/userService.ts | 5 +- frontend/src/assets/file.svg | 3 + frontend/src/assets/folder.svg | 3 + frontend/src/assets/outline-source.svg | 3 + frontend/src/components/DocumentChunks.tsx | 273 +++++++++++++++ frontend/src/components/FileTreeComponent.tsx | 199 +++++++---- frontend/src/models/misc.ts | 1 + frontend/src/settings/Documents.tsx | 312 ++---------------- 9 files changed, 452 insertions(+), 358 deletions(-) create mode 100644 frontend/src/assets/file.svg create mode 100644 frontend/src/assets/folder.svg create mode 100644 frontend/src/assets/outline-source.svg create mode 100644 frontend/src/components/DocumentChunks.tsx diff --git a/frontend/src/api/endpoints.ts b/frontend/src/api/endpoints.ts index bb98f02a..37c52c41 100644 --- a/frontend/src/api/endpoints.ts +++ b/frontend/src/api/endpoints.ts @@ -38,13 +38,20 @@ const endpoints = { UPDATE_TOOL_STATUS: '/api/update_tool_status', UPDATE_TOOL: '/api/update_tool', DELETE_TOOL: '/api/delete_tool', - GET_CHUNKS: (docId: string, page: number, per_page: number) => - `/api/get_chunks?id=${docId}&page=${page}&per_page=${per_page}`, + GET_CHUNKS: ( + docId: string, + page: number, + per_page: number, + path?: string, + ) => + `/api/get_chunks?id=${docId}&page=${page}&per_page=${per_page}${path ? `&path=${encodeURIComponent(path)}` : ''}`, ADD_CHUNK: '/api/add_chunk', DELETE_CHUNK: (docId: string, chunkId: string) => `/api/delete_chunk?id=${docId}&chunk_id=${chunkId}`, UPDATE_CHUNK: '/api/update_chunk', STORE_ATTACHMENT: '/api/store_attachment', + DIRECTORY_STRUCTURE: (docId: string) => + `/api/directory_structure?id=${docId}`, }, CONVERSATION: { ANSWER: '/api/answer', diff --git a/frontend/src/api/services/userService.ts b/frontend/src/api/services/userService.ts index ffb00a6b..43671657 100644 --- a/frontend/src/api/services/userService.ts +++ b/frontend/src/api/services/userService.ts @@ -86,8 +86,9 @@ const userService = { page: number, perPage: number, token: string | null, + path?: string, ): Promise => - apiClient.get(endpoints.USER.GET_CHUNKS(docId, page, perPage), token), + apiClient.get(endpoints.USER.GET_CHUNKS(docId, page, perPage, path), token), addChunk: (data: any, token: string | null): Promise => apiClient.post(endpoints.USER.ADD_CHUNK, data, token), deleteChunk: ( @@ -98,6 +99,8 @@ const userService = { apiClient.delete(endpoints.USER.DELETE_CHUNK(docId, chunkId), token), updateChunk: (data: any, token: string | null): Promise => apiClient.put(endpoints.USER.UPDATE_CHUNK, data, token), + getDirectoryStructure: (docId: string, token: string | null): Promise => + apiClient.get(endpoints.USER.DIRECTORY_STRUCTURE(docId), token), }; export default userService; diff --git a/frontend/src/assets/file.svg b/frontend/src/assets/file.svg new file mode 100644 index 00000000..7120a3c9 --- /dev/null +++ b/frontend/src/assets/file.svg @@ -0,0 +1,3 @@ + + + diff --git a/frontend/src/assets/folder.svg b/frontend/src/assets/folder.svg new file mode 100644 index 00000000..2c2217f0 --- /dev/null +++ b/frontend/src/assets/folder.svg @@ -0,0 +1,3 @@ + + + diff --git a/frontend/src/assets/outline-source.svg b/frontend/src/assets/outline-source.svg new file mode 100644 index 00000000..36b3aa6e --- /dev/null +++ b/frontend/src/assets/outline-source.svg @@ -0,0 +1,3 @@ + + + diff --git a/frontend/src/components/DocumentChunks.tsx b/frontend/src/components/DocumentChunks.tsx new file mode 100644 index 00000000..94307bc0 --- /dev/null +++ b/frontend/src/components/DocumentChunks.tsx @@ -0,0 +1,273 @@ +import React, { useState, useEffect } from 'react'; +import { useSelector } from 'react-redux'; +import { useTranslation } from 'react-i18next'; +import { selectToken } from '../preferences/preferenceSlice'; +import { useDarkTheme, useLoaderState } from '../hooks'; +import userService from '../api/services/userService'; +import ArrowLeft from '../assets/arrow-left.svg'; +import NoFilesIcon from '../assets/no-files.svg'; +import NoFilesDarkIcon from '../assets/no-files-dark.svg'; +import Spinner from '../components/Spinner'; +import Input from '../components/Input'; +import ChunkModal from '../modals/ChunkModal'; +import { ActiveState } from '../models/misc'; +import { ChunkType } from '../settings/types'; + +interface DocumentChunksProps { + documentId: string; + documentName?: string; + handleGoBack: () => void; + showHeader?: boolean; + path?: string; +} + +const DocumentChunks: React.FC = ({ + documentId, + documentName, + handleGoBack, + showHeader = true, + path, +}) => { + const { t } = useTranslation(); + const token = useSelector(selectToken); + const [isDarkTheme] = useDarkTheme(); + const [paginatedChunks, setPaginatedChunks] = useState([]); + const [page, setPage] = useState(1); + const [perPage, setPerPage] = useState(5); + const [totalChunks, setTotalChunks] = useState(0); + const [loading, setLoading] = useLoaderState(true); + const [searchTerm, setSearchTerm] = useState(''); + const [addModal, setAddModal] = useState('INACTIVE'); + const [editModal, setEditModal] = useState<{ + state: ActiveState; + chunk: ChunkType | null; + }>({ state: 'INACTIVE', chunk: null }); + + const fetchChunks = () => { + setLoading(true); + try { + userService + .getDocumentChunks(documentId, page, perPage, token, path) + .then((response) => { + if (!response.ok) { + setLoading(false); + setPaginatedChunks([]); + throw new Error('Failed to fetch chunks data'); + } + return response.json(); + }) + .then((data) => { + setPage(data.page); + setPerPage(data.per_page); + setTotalChunks(data.total); + setPaginatedChunks(data.chunks); + setLoading(false); + }); + } catch (e) { + console.log(e); + setLoading(false); + } + }; + + const handleAddChunk = (title: string, text: string) => { + try { + userService + .addChunk( + { + id: documentId, + text: text, + metadata: { + title: title, + }, + }, + token, + ) + .then((response) => { + if (!response.ok) { + throw new Error('Failed to add chunk'); + } + fetchChunks(); + }); + } catch (e) { + console.log(e); + } + }; + + const handleUpdateChunk = (title: string, text: string, chunk: ChunkType) => { + try { + userService + .updateChunk( + { + id: documentId, + chunk_id: chunk.doc_id, + text: text, + metadata: { + title: title, + }, + }, + token, + ) + .then((response) => { + if (!response.ok) { + throw new Error('Failed to update chunk'); + } + fetchChunks(); + }); + } catch (e) { + console.log(e); + } + }; + + const handleDeleteChunk = (chunk: ChunkType) => { + try { + userService + .deleteChunk(documentId, chunk.doc_id, token) + .then((response) => { + if (!response.ok) { + throw new Error('Failed to delete chunk'); + } + setEditModal({ state: 'INACTIVE', chunk: null }); + fetchChunks(); + }); + } catch (e) { + console.log(e); + } + }; + + useEffect(() => { + fetchChunks(); + }, [page, perPage]); + + return ( +
+ {showHeader && ( +
+ +

{t('settings.documents.backToAll')}

+
+ )} +
+
+

{`${totalChunks} ${t('settings.documents.chunks')}`}

+ + { + setSearchTerm(e.target.value); + }} + borderVariant="thin" + /> +
+ +
+ {loading ? ( +
+
+ +
+
+ ) : ( +
+ {paginatedChunks.filter((chunk) => { + if (!chunk.metadata?.title) return true; + return chunk.metadata.title + .toLowerCase() + .includes(searchTerm.toLowerCase()); + }).length === 0 ? ( +
+ {t('settings.documents.noChunksAlt')} + {t('settings.documents.noChunks')} +
+ ) : ( + paginatedChunks + .filter((chunk) => { + if (!chunk.metadata?.title) return true; + return chunk.metadata.title + .toLowerCase() + .includes(searchTerm.toLowerCase()); + }) + .map((chunk, index) => ( +
+
+
+ +
+
+

+ {chunk.text} +

+
+
+
+ )) + )} +
+ )} + + + {editModal.chunk && ( + + setEditModal((prev) => ({ ...prev, state })) + } + handleSubmit={(title, text) => { + handleUpdateChunk(title, text, editModal.chunk as ChunkType); + }} + originalText={editModal.chunk?.text ?? ''} + originalTitle={editModal.chunk?.metadata?.title ?? ''} + handleDelete={() => { + handleDeleteChunk(editModal.chunk as ChunkType); + }} + /> + )} +
+ ); +}; + +export default DocumentChunks; diff --git a/frontend/src/components/FileTreeComponent.tsx b/frontend/src/components/FileTreeComponent.tsx index b106c106..d4252064 100644 --- a/frontend/src/components/FileTreeComponent.tsx +++ b/frontend/src/components/FileTreeComponent.tsx @@ -1,5 +1,8 @@ -import React, { useEffect, useState, useRef } from 'react'; +import React, { useState, useRef, useEffect } from 'react'; +import { useTranslation } from 'react-i18next'; import { useSelector } from 'react-redux'; +import DocumentChunks from './DocumentChunks'; +import ContextMenu, { MenuOption } from './ContextMenu'; import userService from '../api/services/userService'; import FileIcon from '../assets/file.svg'; import FolderIcon from '../assets/folder.svg'; @@ -8,9 +11,6 @@ import ThreeDots from '../assets/three-dots.svg'; import EyeView from '../assets/eye-view.svg'; import OutlineSource from '../assets/outline-source.svg'; import Trash from '../assets/red-trash.svg'; -import Spinner from './Spinner'; -import { useTranslation } from 'react-i18next'; -import ContextMenu, { MenuOption } from './ContextMenu'; interface FileNode { type?: string; @@ -45,6 +45,18 @@ const FileTreeComponent: React.FC = ({ const menuRefs = useRef<{ [key: string]: React.RefObject; }>({}); + const [selectedFile, setSelectedFile] = useState<{ + id: string; + name: string; + } | null>(null); + + const handleFileClick = (fileName: string) => { + const fullPath = [...currentPath, fileName].join('/'); + setSelectedFile({ + id: fullPath, + name: fileName, + }); + }; useEffect(() => { const fetchDirectoryStructure = async () => { @@ -129,7 +141,11 @@ const FileTreeComponent: React.FC = ({ setActiveMenuId(itemId); }; - const getActionOptions = (name: string, isFile: boolean): MenuOption[] => { + const getActionOptions = ( + name: string, + isFile: boolean, + itemId: string, + ): MenuOption[] => { const options: MenuOption[] = []; if (isFile) { @@ -138,8 +154,7 @@ const FileTreeComponent: React.FC = ({ label: t('settings.documents.view'), onClick: (event: React.SyntheticEvent) => { event.stopPropagation(); - console.log('View file:', name); - // View file action will be implemented later + handleFileClick(name); }, iconWidth: 18, iconHeight: 18, @@ -195,32 +210,89 @@ const FileTreeComponent: React.FC = ({ ); }; + + const calculateDirectoryStats = ( + structure: DirectoryStructure, + ): { totalSize: number; totalTokens: number } => { + let totalSize = 0; + let totalTokens = 0; + + Object.entries(structure).forEach(([_, node]) => { + if (node.type) { + // It's a file + totalSize += node.size_bytes || 0; + totalTokens += node.token_count || 0; + } else { + // It's a directory, recurse + const stats = calculateDirectoryStats(node); + totalSize += stats.totalSize; + totalTokens += stats.totalTokens; + } + }); + + return { totalSize, totalTokens }; + }; + const renderFileTree = (structure: DirectoryStructure): React.ReactNode[] => { + // Separate directories and files const entries = Object.entries(structure); const directories = entries.filter(([_, node]) => !node.type); const files = entries.filter(([_, node]) => node.type); + // Create parent directory row + const parentRow = + currentPath.length > 0 + ? [ + + +
+ Parent folder + .. +
+ + - + - + + , + ] + : []; + + // Render directories first, then files return [ + ...parentRow, ...directories.map(([name, node]) => { const itemId = `dir-${name}`; const menuRef = getMenuRef(itemId); + const dirStats = calculateDirectoryStats(node as DirectoryStructure); return ( navigateToDirectory(name)} > -
navigateToDirectory(name)} - > +
Folder - {name} + {name}
- - - - + + {dirStats.totalTokens > 0 + ? dirStats.totalTokens.toLocaleString() + : '-'} + + + {dirStats.totalSize > 0 ? formatBytes(dirStats.totalSize) : '-'} +
-

{t('settings.documents.backToAll')}

-
-
-
-

{`${totalChunks} ${t('settings.documents.chunks')}`}

- - { - setSearchTerm(e.target.value); - }} - borderVariant="thin" - /> -
- -
- {loading ? ( -
-
- -
-
- ) : ( -
- {paginatedChunks.filter((chunk) => { - if (!chunk.metadata?.title) return true; - return chunk.metadata.title - .toLowerCase() - .includes(searchTerm.toLowerCase()); - }).length === 0 ? ( -
- {t('settings.documents.noChunksAlt')} - {t('settings.documents.noChunks')} -
- ) : ( - paginatedChunks - .filter((chunk) => { - if (!chunk.metadata?.title) return true; - return chunk.metadata.title - .toLowerCase() - .includes(searchTerm.toLowerCase()); - }) - .map((chunk, index) => ( -
-
-
- -
-
-

- {chunk.metadata?.title ?? 'Untitled'} -

-

- {chunk.text} -

-
-
-
- )) - )} -
- )} - {!loading && - paginatedChunks.filter((chunk) => { - if (!chunk.metadata?.title) return true; - return chunk.metadata.title - .toLowerCase() - .includes(searchTerm.toLowerCase()); - }).length !== 0 && ( -
- { - setPage(page); - }} - onRowsPerPageChange={(rows) => { - setPerPage(rows); - setPage(1); - }} - /> -
- )} - - {editModal.chunk && ( - - setEditModal((prev) => ({ ...prev, state })) - } - handleSubmit={(title, text) => { - handleUpdateChunk(title, text, editModal.chunk as ChunkType); - }} - originalText={editModal.chunk?.text ?? ''} - originalTitle={editModal.chunk?.metadata?.title ?? ''} - handleDelete={() => { - handleDeleteChunk(editModal.chunk as ChunkType); - }} - /> - )} -
- ); -} From 99a8962183e33c3041b7d5a7b080321b9b393c32 Mon Sep 17 00:00:00 2001 From: ManishMadan2882 Date: Thu, 17 Jul 2025 01:05:24 +0530 Subject: [PATCH 15/57] (fix/docs) menu event capture --- frontend/src/settings/Documents.tsx | 44 +++++++++-------------------- 1 file changed, 14 insertions(+), 30 deletions(-) diff --git a/frontend/src/settings/Documents.tsx b/frontend/src/settings/Documents.tsx index 14ab2e87..9a8e4195 100644 --- a/frontend/src/settings/Documents.tsx +++ b/frontend/src/settings/Documents.tsx @@ -1,3 +1,4 @@ + import React, { useCallback, useEffect, useRef, useState } from 'react'; import { useTranslation } from 'react-i18next'; import { useDispatch, useSelector } from 'react-redux'; @@ -94,23 +95,6 @@ export default function Documents({ setActiveMenuId(docId); }; - useEffect(() => { - const handleClickOutside = (event: MouseEvent) => { - if (activeMenuId) { - const activeRef = menuRefs.current[activeMenuId]; - if ( - activeRef?.current && - !activeRef.current.contains(event.target as Node) - ) { - setActiveMenuId(null); - } - } - }; - - document.addEventListener('mousedown', handleClickOutside); - return () => document.removeEventListener('mousedown', handleClickOutside); - }, [activeMenuId]); - const currentDocuments = paginatedDocuments ?? []; const syncOptions = [ { label: t('settings.documents.syncFrequency.never'), value: 'never' }, @@ -288,7 +272,7 @@ export default function Documents({ documentId={documentToView.id || ''} documentName={documentToView.name} handleGoBack={() => setDocumentToView(undefined)} - showHeader={false} + showHeader={true} /> )} @@ -345,19 +329,19 @@ export default function Documents({

) : ( -
- {currentDocuments.map((document, index) => { - const docId = document.id ? document.id.toString() : ''; +
+ {currentDocuments.map((document, index) => { + const docId = document.id ? document.id.toString() : ''; - return ( -
-
+ return ( +
+

Date: Thu, 17 Jul 2025 02:16:40 +0530 Subject: [PATCH 16/57] (feat:chunks) redesigned --- frontend/src/components/DocumentChunks.tsx | 172 +++++++++++++-------- 1 file changed, 104 insertions(+), 68 deletions(-) diff --git a/frontend/src/components/DocumentChunks.tsx b/frontend/src/components/DocumentChunks.tsx index 94307bc0..f619afb0 100644 --- a/frontend/src/components/DocumentChunks.tsx +++ b/frontend/src/components/DocumentChunks.tsx @@ -7,11 +7,13 @@ import userService from '../api/services/userService'; import ArrowLeft from '../assets/arrow-left.svg'; import NoFilesIcon from '../assets/no-files.svg'; import NoFilesDarkIcon from '../assets/no-files-dark.svg'; +import OutlineSource from '../assets/outline-source.svg'; import Spinner from '../components/Spinner'; -import Input from '../components/Input'; import ChunkModal from '../modals/ChunkModal'; import { ActiveState } from '../models/misc'; import { ChunkType } from '../settings/types'; +import EditIcon from '../assets/edit.svg'; +import Pagination from './DocumentPagination'; interface DocumentChunksProps { documentId: string; @@ -43,6 +45,8 @@ const DocumentChunks: React.FC = ({ chunk: ChunkType | null; }>({ state: 'INACTIVE', chunk: null }); + const pathParts = path ? path.split('/') : []; + const fetchChunks = () => { setLoading(true); try { @@ -138,40 +142,71 @@ const DocumentChunks: React.FC = ({ fetchChunks(); }, [page, perPage]); + const filteredChunks = paginatedChunks.filter((chunk) => { + if (!chunk.metadata?.title) return true; + return chunk.metadata.title + .toLowerCase() + .includes(searchTerm.toLowerCase()); + }); + return (
- {showHeader && ( -
+
+ {showHeader && ( -

{t('settings.documents.backToAll')}

+ )} + +
+ source + + {documentName} + + + {pathParts.length > 0 && ( + <> + / + {pathParts.map((part, index) => ( + + + {part} + + {index < pathParts.length - 1 && ( + / + )} + + ))} + + )}
- )} -
-
-

{`${totalChunks} ${t('settings.documents.chunks')}`}

- - { - setSearchTerm(e.target.value); - }} - borderVariant="thin" - /> +
+ +
+
+
+ {totalChunks > 999999 + ? `${(totalChunks / 1000000).toFixed(2)}M` + : totalChunks > 999 + ? `${(totalChunks / 1000).toFixed(2)}K` + : totalChunks} {t('settings.documents.chunks')} +
+
+
+ setSearchTerm(e.target.value)} + className="w-full h-full px-3 py-2 bg-transparent border-none outline-none font-normal text-[13.56px] leading-[100%] dark:text-[#E0E0E0]" + /> +
) : ( -
- {paginatedChunks.filter((chunk) => { - if (!chunk.metadata?.title) return true; - return chunk.metadata.title - .toLowerCase() - .includes(searchTerm.toLowerCase()); - }).length === 0 ? ( -
+
+ {filteredChunks.length === 0 ? ( +
{t('settings.documents.noChunksAlt')} = ({ {t('settings.documents.noChunks')}
) : ( - paginatedChunks - .filter((chunk) => { - if (!chunk.metadata?.title) return true; - return chunk.metadata.title - .toLowerCase() - .includes(searchTerm.toLowerCase()); - }) - .map((chunk, index) => ( -
-
-
- -
-
-

- {chunk.text} -

+ filteredChunks.map((chunk, index) => ( +
+
+
+
+ {chunk.metadata.token_count ? chunk.metadata.token_count.toLocaleString() : '-'} tokens
+ +
+
+

+ {chunk.text} +

- )) +
+ )) )}
)} + {!loading && filteredChunks.length > 0 && ( + { + setPerPage(rows); + setPage(1); + }} + /> + )} + Date: Thu, 17 Jul 2025 03:08:01 +0530 Subject: [PATCH 17/57] (feat:chunks) use common header, navigate --- frontend/src/components/DocumentChunks.tsx | 48 +++++++-------- frontend/src/components/FileTreeComponent.tsx | 59 +++++++++++-------- 2 files changed, 59 insertions(+), 48 deletions(-) diff --git a/frontend/src/components/DocumentChunks.tsx b/frontend/src/components/DocumentChunks.tsx index f619afb0..90718473 100644 --- a/frontend/src/components/DocumentChunks.tsx +++ b/frontend/src/components/DocumentChunks.tsx @@ -151,39 +151,39 @@ const DocumentChunks: React.FC = ({ return (
-
- {showHeader && ( + {showHeader && ( +
- )} -
- source - - {documentName} - +
+ source + + {documentName} + - {pathParts.length > 0 && ( - <> - / - {pathParts.map((part, index) => ( - - - {part} - - {index < pathParts.length - 1 && ( - / - )} - - ))} - - )} + {pathParts.length > 0 && ( + <> + / + {pathParts.map((part, index) => ( + + + {part} + + {index < pathParts.length - 1 && ( + / + )} + + ))} + + )} +
-
+ )}
diff --git a/frontend/src/components/FileTreeComponent.tsx b/frontend/src/components/FileTreeComponent.tsx index d4252064..170c4cb9 100644 --- a/frontend/src/components/FileTreeComponent.tsx +++ b/frontend/src/components/FileTreeComponent.tsx @@ -114,7 +114,9 @@ const FileTreeComponent: React.FC = ({ }; const handleBackNavigation = () => { - if (currentPath.length === 0) { + if (selectedFile) { + setSelectedFile(null); + } else if (currentPath.length === 0) { if (onBackToDocuments) { onBackToDocuments(); } @@ -187,7 +189,7 @@ const FileTreeComponent: React.FC = ({ > left-arrow - +
source {sourceName} @@ -206,6 +208,14 @@ const FileTreeComponent: React.FC = ({ ))} )} + {selectedFile && ( + <> + / + + {selectedFile.name} + + + )}
); @@ -243,26 +253,26 @@ const FileTreeComponent: React.FC = ({ const parentRow = currentPath.length > 0 ? [ - - -
- Parent folder - .. -
- - - - - - - , - ] + + +
+ Parent folder + .. +
+ + - + - + + , + ] : []; // Render directories first, then files @@ -377,16 +387,17 @@ const FileTreeComponent: React.FC = ({ return ( <> +
{renderPathNavigation()}
{selectedFile ? ( setSelectedFile(null)} path={selectedFile.id} + showHeader={false} /> ) : (
-
{renderPathNavigation()}
From f336d445957d04b64c351213ea90fb08add0cb77 Mon Sep 17 00:00:00 2001 From: ManishMadan2882 Date: Fri, 18 Jul 2025 15:03:23 +0530 Subject: [PATCH 18/57] (feat:chunks) search in dir --- frontend/src/components/DocumentChunks.tsx | 8 +- frontend/src/components/FileTreeComponent.tsx | 124 ++++++++++++++++-- 2 files changed, 119 insertions(+), 13 deletions(-) diff --git a/frontend/src/components/DocumentChunks.tsx b/frontend/src/components/DocumentChunks.tsx index 90718473..f994b6f7 100644 --- a/frontend/src/components/DocumentChunks.tsx +++ b/frontend/src/components/DocumentChunks.tsx @@ -140,7 +140,7 @@ const DocumentChunks: React.FC = ({ useEffect(() => { fetchChunks(); - }, [page, perPage]); + }, [page, perPage, path]); const filteredChunks = paginatedChunks.filter((chunk) => { if (!chunk.metadata?.title) return true; @@ -150,7 +150,7 @@ const DocumentChunks: React.FC = ({ }); return ( -
+
{showHeader && (
)} -
+
{totalChunks > 999999 @@ -234,7 +234,7 @@ const DocumentChunks: React.FC = ({ filteredChunks.map((chunk, index) => (
diff --git a/frontend/src/components/FileTreeComponent.tsx b/frontend/src/components/FileTreeComponent.tsx index 170c4cb9..6a61ca12 100644 --- a/frontend/src/components/FileTreeComponent.tsx +++ b/frontend/src/components/FileTreeComponent.tsx @@ -11,6 +11,7 @@ import ThreeDots from '../assets/three-dots.svg'; import EyeView from '../assets/eye-view.svg'; import OutlineSource from '../assets/outline-source.svg'; import Trash from '../assets/red-trash.svg'; +import SearchIcon from '../assets/search.svg'; interface FileNode { type?: string; @@ -29,6 +30,12 @@ interface FileTreeComponentProps { onBackToDocuments?: () => void; } +interface SearchResult { + name: string; + path: string; + isFile: boolean; +} + const FileTreeComponent: React.FC = ({ docId, sourceName, @@ -49,6 +56,8 @@ const FileTreeComponent: React.FC = ({ id: string; name: string; } | null>(null); + const [searchQuery, setSearchQuery] = useState(''); + const [searchResults, setSearchResults] = useState([]); const handleFileClick = (fileName: string) => { const fullPath = [...currentPath, fileName].join('/'); @@ -189,7 +198,7 @@ const FileTreeComponent: React.FC = ({ > left-arrow - +
source {sourceName} @@ -385,20 +394,116 @@ const FileTreeComponent: React.FC = ({ }; const currentDirectory = getCurrentDirectory(); + const searchFiles = (query: string, structure: DirectoryStructure, currentPath: string[] = []): SearchResult[] => { + let results: SearchResult[] = []; + + Object.entries(structure).forEach(([name, node]) => { + const fullPath = [...currentPath, name].join('/'); + + if (name.toLowerCase().includes(query.toLowerCase())) { + results.push({ + name, + path: fullPath, + isFile: !!node.type + }); + } + + if (!node.type) { + // If it's a directory, search recursively + results = [...results, ...searchFiles(query, node as DirectoryStructure, [...currentPath, name])]; + } + }); + + return results; + }; + + + const handleSearchSelect = (result: SearchResult) => { + if (result.isFile) { + const pathParts = result.path.split('/'); + const fileName = pathParts.pop() || ''; + setCurrentPath(pathParts); + + setSelectedFile({ + id: result.path, + name: fileName + }); + } else { + setCurrentPath(result.path.split('/')); + setSelectedFile(null); + } + setSearchQuery(''); + setSearchResults([]); + }; + return ( <>
{renderPathNavigation()}
{selectedFile ? ( - setSelectedFile(null)} - path={selectedFile.id} - showHeader={false} - /> +
+ {/* Search Panel */} +
+
+ { + setSearchQuery(e.target.value); + if (directoryStructure) { + setSearchResults(searchFiles(e.target.value, directoryStructure)); + } + }} + placeholder={t('settings.documents.searchFiles')} + className={`w-full px-4 py-2 pl-10 border border-[#D1D9E0] dark:border-[#6A6A6A] ${searchQuery ? 'rounded-t-md rounded-b-none border-b-0' : 'rounded-md' + } bg-transparent dark:text-[#E0E0E0] focus:outline-none`} + /> + + Search + + {searchQuery && ( +
+ {searchResults.map((result, index) => ( +
handleSearchSelect(result)} + className={`flex items-center px-3 py-2 cursor-pointer hover:bg-[#ECEEEF] dark:hover:bg-[#27282D] ${index !== searchResults.length - 1 ? "border-b border-[#D1D9E0] dark:border-[#6A6A6A]" : "" + }`} + > + {result.isFile + + {result.path} + +
+ ))} + {searchResults.length === 0 && ( +
+ {t('settings.documents.noResults')} +
+ )} +
+ )} +
+
+
+ setSelectedFile(null)} + path={selectedFile.id} + showHeader={false} + /> +
+
) : (
-
@@ -429,3 +534,4 @@ const FileTreeComponent: React.FC = ({ }; export default FileTreeComponent; + From 3755316d4953fda778a05ddb77248c3dd9b97da3 Mon Sep 17 00:00:00 2001 From: ManishMadan2882 Date: Mon, 21 Jul 2025 16:30:30 +0530 Subject: [PATCH 19/57] (fix:chunks) responsive design --- frontend/src/components/DocumentChunks.tsx | 64 +++++++++---------- frontend/src/components/FileTreeComponent.tsx | 58 +++++++++-------- 2 files changed, 63 insertions(+), 59 deletions(-) diff --git a/frontend/src/components/DocumentChunks.tsx b/frontend/src/components/DocumentChunks.tsx index f994b6f7..1cdef63f 100644 --- a/frontend/src/components/DocumentChunks.tsx +++ b/frontend/src/components/DocumentChunks.tsx @@ -152,30 +152,30 @@ const DocumentChunks: React.FC = ({ return (
{showHeader && ( -
+
-
- source - +
+ source + {documentName} {pathParts.length > 0 && ( <> - / + / {pathParts.map((part, index) => ( - + {part} {index < pathParts.length - 1 && ( - / + / )} ))} @@ -185,8 +185,8 @@ const DocumentChunks: React.FC = ({
)} -
-
+
+
{totalChunks > 999999 ? `${(totalChunks / 1000000).toFixed(2)}M` @@ -206,7 +206,7 @@ const DocumentChunks: React.FC = ({
{loading ? ( -
-
- -
-
- ) : ( -
- {filteredChunks.length === 0 ? ( -
- {t('settings.documents.noChunksAlt')} - {t('settings.documents.noChunks')} -
- ) : ( - filteredChunks.map((chunk, index) => ( -
+
+ +
+) : ( +
+ {filteredChunks.length === 0 ? ( +
+ {t('settings.documents.noChunksAlt')} + {t('settings.documents.noChunks')} +
+ ) : ( + filteredChunks.map((chunk, index) => ( +
diff --git a/frontend/src/components/FileTreeComponent.tsx b/frontend/src/components/FileTreeComponent.tsx index 6a61ca12..64a8e4b1 100644 --- a/frontend/src/components/FileTreeComponent.tsx +++ b/frontend/src/components/FileTreeComponent.tsx @@ -436,6 +436,7 @@ const FileTreeComponent: React.FC = ({ setSearchResults([]); }; + return ( <>
{renderPathNavigation()}
@@ -465,24 +466,29 @@ const FileTreeComponent: React.FC = ({ /> {searchQuery && ( -
- {searchResults.map((result, index) => ( -
handleSearchSelect(result)} - className={`flex items-center px-3 py-2 cursor-pointer hover:bg-[#ECEEEF] dark:hover:bg-[#27282D] ${index !== searchResults.length - 1 ? "border-b border-[#D1D9E0] dark:border-[#6A6A6A]" : "" - }`} - > - {result.isFile - - {result.path} - -
- ))} +
+ {searchResults.map((result, index) => { + const name = result.path.split('/').pop() || result.path; + + return ( +
handleSearchSelect(result)} + title={result.path} + className={`flex items-center px-3 py-2 cursor-pointer hover:bg-[#ECEEEF] dark:hover:bg-[#27282D] ${index !== searchResults.length - 1 ? "border-b border-[#D1D9E0] dark:border-[#6A6A6A]" : "" + }`} + > + {result.isFile + + {name} + +
+ ); + })} {searchResults.length === 0 && (
{t('settings.documents.noResults')} @@ -493,14 +499,14 @@ const FileTreeComponent: React.FC = ({
- setSelectedFile(null)} - path={selectedFile.id} - showHeader={false} - /> -
+ setSelectedFile(null)} + path={selectedFile.id} + showHeader={false} + /> +
) : (
From ff3c7eb5fb7ed1abbd9fa16d208364ae4e4850cc Mon Sep 17 00:00:00 2001 From: ManishMadan2882 Date: Mon, 21 Jul 2025 16:31:42 +0530 Subject: [PATCH 20/57] (fix:delete_old) comply with storage abtrctn --- application/api/user/routes.py | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/application/api/user/routes.py b/application/api/user/routes.py index 68936dcc..d0df5b7f 100644 --- a/application/api/user/routes.py +++ b/application/api/user/routes.py @@ -473,7 +473,7 @@ class DeleteByIds(Resource): @user_ns.route("/api/delete_old") class DeleteOldIndexes(Resource): @api.doc( - description="Deletes old indexes", + description="Deletes old indexes and associated files", params={"source_id": "The source ID to delete"}, ) def get(self): @@ -490,21 +490,40 @@ class DeleteOldIndexes(Resource): ) if not doc: return make_response(jsonify({"status": "not found"}), 404) + + storage = StorageCreator.get_storage() + try: + # Delete vector index if settings.VECTOR_STORE == "faiss": - shutil.rmtree(os.path.join(current_dir, "indexes", str(doc["_id"]))) + index_path = f"indexes/{str(doc['_id'])}" + if storage.file_exists(f"{index_path}/index.faiss"): + storage.delete_file(f"{index_path}/index.faiss") + if storage.file_exists(f"{index_path}/index.pkl"): + storage.delete_file(f"{index_path}/index.pkl") else: vectorstore = VectorCreator.create_vectorstore( settings.VECTOR_STORE, source_id=str(doc["_id"]) ) vectorstore.delete_index() + + if "file_path" in doc and doc["file_path"]: + file_path = doc["file_path"] + if storage.is_directory(file_path): + files = storage.list_files(file_path) + for f in files: + storage.delete_file(f) + else: + storage.delete_file(file_path) + except FileNotFoundError: pass except Exception as err: current_app.logger.error( - f"Error deleting old indexes: {err}", exc_info=True + f"Error deleting files and indexes: {err}", exc_info=True ) return make_response(jsonify({"success": False}), 400) + sources_collection.delete_one({"_id": ObjectId(source_id)}) return make_response(jsonify({"success": True}), 200) From 6eb2c884a232ebfd21a9183e350b27347c05a1e1 Mon Sep 17 00:00:00 2001 From: ManishMadan2882 Date: Tue, 22 Jul 2025 19:36:52 +0530 Subject: [PATCH 21/57] (refactor) separation in chunks/files view --- frontend/src/assets/search.svg | 3 + frontend/src/components/DocumentChunks.tsx | 316 +++++++++--------- frontend/src/components/FileTreeComponent.tsx | 186 ++++++----- frontend/src/settings/Documents.tsx | 1 - 4 files changed, 274 insertions(+), 232 deletions(-) create mode 100644 frontend/src/assets/search.svg diff --git a/frontend/src/assets/search.svg b/frontend/src/assets/search.svg new file mode 100644 index 00000000..2f3d57c4 --- /dev/null +++ b/frontend/src/assets/search.svg @@ -0,0 +1,3 @@ + + + diff --git a/frontend/src/components/DocumentChunks.tsx b/frontend/src/components/DocumentChunks.tsx index 1cdef63f..e4dbc4a8 100644 --- a/frontend/src/components/DocumentChunks.tsx +++ b/frontend/src/components/DocumentChunks.tsx @@ -1,4 +1,4 @@ -import React, { useState, useEffect } from 'react'; +import React, { useState, useEffect, useRef } from 'react'; import { useSelector } from 'react-redux'; import { useTranslation } from 'react-i18next'; import { selectToken } from '../preferences/preferenceSlice'; @@ -19,16 +19,16 @@ interface DocumentChunksProps { documentId: string; documentName?: string; handleGoBack: () => void; - showHeader?: boolean; path?: string; + renderFileSearch?: () => React.ReactNode; } const DocumentChunks: React.FC = ({ documentId, documentName, handleGoBack, - showHeader = true, path, + renderFileSearch }) => { const { t } = useTranslation(); const token = useSelector(selectToken); @@ -45,6 +45,8 @@ const DocumentChunks: React.FC = ({ chunk: ChunkType | null; }>({ state: 'INACTIVE', chunk: null }); + + const pathParts = path ? path.split('/') : []; const fetchChunks = () => { @@ -149,159 +151,173 @@ const DocumentChunks: React.FC = ({ .includes(searchTerm.toLowerCase()); }); - return ( -
- {showHeader && ( -
- - -
- source - - {documentName} - - - {pathParts.length > 0 && ( - <> - / - {pathParts.map((part, index) => ( - - - {part} - - {index < pathParts.length - 1 && ( - / - )} - - ))} - - )} -
-
- )} - -
-
-
- {totalChunks > 999999 - ? `${(totalChunks / 1000000).toFixed(2)}M` - : totalChunks > 999 - ? `${(totalChunks / 1000).toFixed(2)}K` - : totalChunks} {t('settings.documents.chunks')} -
-
-
- setSearchTerm(e.target.value)} - className="w-full h-full px-3 py-2 bg-transparent border-none outline-none font-normal text-[13.56px] leading-[100%] dark:text-[#E0E0E0]" - /> -
-
+ const renderPathNavigation = () => { + return ( +
-
- {loading ? ( -
- -
-) : ( -
- {filteredChunks.length === 0 ? ( -
- {t('settings.documents.noChunksAlt')} - {t('settings.documents.noChunks')} -
- ) : ( - filteredChunks.map((chunk, index) => ( -
-
-
-
- {chunk.metadata.token_count ? chunk.metadata.token_count.toLocaleString() : '-'} tokens -
- -
-
-

- {chunk.text} -

-
-
-
- )) + +
+ source + + {documentName} + + + {pathParts.length > 0 && ( + <> + / + {pathParts.map((part, index) => ( + + + {part} + + {index < pathParts.length - 1 && ( + / + )} + + ))} + )}
- )} +
+ ); + }; - {!loading && filteredChunks.length > 0 && ( - { - setPerPage(rows); - setPage(1); - }} - /> - )} + return ( +
+
+ {renderPathNavigation()} +
+
+ {renderFileSearch && ( + renderFileSearch() + )} - - {editModal.chunk && ( - - setEditModal((prev) => ({ ...prev, state })) - } - handleSubmit={(title, text) => { - handleUpdateChunk(title, text, editModal.chunk as ChunkType); - }} - originalText={editModal.chunk?.text ?? ''} - originalTitle={editModal.chunk?.metadata?.title ?? ''} - handleDelete={() => { - handleDeleteChunk(editModal.chunk as ChunkType); - }} - /> - )} -
- ); + {/* Right side: Chunks content */} +
+
+
+
+ {totalChunks > 999999 + ? `${(totalChunks / 1000000).toFixed(2)}M` + : totalChunks > 999 + ? `${(totalChunks / 1000).toFixed(2)}K` + : totalChunks} {t('settings.documents.chunks')} +
+
+
+ setSearchTerm(e.target.value)} + className="w-full h-full px-3 py-2 bg-transparent border-none outline-none font-normal text-[13.56px] leading-[100%] dark:text-[#E0E0E0]" + /> +
+
+ +
+ {loading ? ( +
+ +
+ ) : ( +
+ {filteredChunks.length === 0 ? ( +
+ {t('settings.documents.noChunksAlt')} + {t('settings.documents.noChunks')} +
+ ) : ( + filteredChunks.map((chunk, index) => ( +
+
+
+
+ {chunk.metadata.token_count ? chunk.metadata.token_count.toLocaleString() : '-'} tokens +
+ +
+
+

+ {chunk.text} +

+
+
+
+ )) + )} +
+ )} + + {!loading && filteredChunks.length > 0 && ( + { + setPerPage(rows); + setPage(1); + }} + /> + )} + + + {editModal.chunk && ( + + setEditModal((prev) => ({ ...prev, state })) + } + handleSubmit={(title, text) => { + handleUpdateChunk(title, text, editModal.chunk as ChunkType); + }} + originalText={editModal.chunk?.text ?? ''} + originalTitle={editModal.chunk?.metadata?.title ?? ''} + handleDelete={() => { + handleDeleteChunk(editModal.chunk as ChunkType); + }} + /> + )} +
+
+
+ ); }; -export default DocumentChunks; + export default DocumentChunks; diff --git a/frontend/src/components/FileTreeComponent.tsx b/frontend/src/components/FileTreeComponent.tsx index 64a8e4b1..8cdbb4f3 100644 --- a/frontend/src/components/FileTreeComponent.tsx +++ b/frontend/src/components/FileTreeComponent.tsx @@ -12,6 +12,7 @@ import EyeView from '../assets/eye-view.svg'; import OutlineSource from '../assets/outline-source.svg'; import Trash from '../assets/red-trash.svg'; import SearchIcon from '../assets/search.svg'; +import { useOutsideAlerter } from '../hooks'; interface FileNode { type?: string; @@ -58,6 +59,17 @@ const FileTreeComponent: React.FC = ({ } | null>(null); const [searchQuery, setSearchQuery] = useState(''); const [searchResults, setSearchResults] = useState([]); + const searchDropdownRef = useRef(null); + + useOutsideAlerter( + searchDropdownRef, + () => { + setSearchQuery(''); + setSearchResults([]); + }, + [], + false + ); const handleFileClick = (fileName: string) => { const fullPath = [...currentPath, fileName].join('/'); @@ -435,103 +447,115 @@ const FileTreeComponent: React.FC = ({ setSearchQuery(''); setSearchResults([]); }; + const renderFileSearch = () => { + return ( +
+
+ { + setSearchQuery(e.target.value); + if (directoryStructure) { + setSearchResults(searchFiles(e.target.value, directoryStructure)); + } + }} + placeholder={t('settings.documents.searchFiles')} + className={`w-full px-4 py-2 pl-10 border border-[#D1D9E0] dark:border-[#6A6A6A] ${ + searchQuery ? 'rounded-t-md rounded-b-none border-b-0' : 'rounded-md' + } bg-transparent dark:text-[#E0E0E0] focus:outline-none`} + /> + Search + + {searchQuery && ( +
+ {searchResults.length === 0 ? ( +
+ {t('settings.documents.noResults')} +
+ ) : ( + searchResults.map((result, index) => ( +
handleSearchSelect(result)} + title={result.path} + className={`flex items-center px-3 py-2 cursor-pointer hover:bg-[#ECEEEF] dark:hover:bg-[#27282D] ${ + index !== searchResults.length - 1 ? "border-b border-[#D1D9E0] dark:border-[#6A6A6A]" : "" + }`} + > + {result.isFile + + {result.path.split('/').pop() || result.path} + +
+ )) + )} +
+ )} +
+
+ ); + }; return ( <> -
{renderPathNavigation()}
{selectedFile ? (
- {/* Search Panel */} -
-
- { - setSearchQuery(e.target.value); - if (directoryStructure) { - setSearchResults(searchFiles(e.target.value, directoryStructure)); - } - }} - placeholder={t('settings.documents.searchFiles')} - className={`w-full px-4 py-2 pl-10 border border-[#D1D9E0] dark:border-[#6A6A6A] ${searchQuery ? 'rounded-t-md rounded-b-none border-b-0' : 'rounded-md' - } bg-transparent dark:text-[#E0E0E0] focus:outline-none`} - /> - - Search - - {searchQuery && ( -
- {searchResults.map((result, index) => { - const name = result.path.split('/').pop() || result.path; - - return ( -
handleSearchSelect(result)} - title={result.path} - className={`flex items-center px-3 py-2 cursor-pointer hover:bg-[#ECEEEF] dark:hover:bg-[#27282D] ${index !== searchResults.length - 1 ? "border-b border-[#D1D9E0] dark:border-[#6A6A6A]" : "" - }`} - > - {result.isFile - - {name} - -
- ); - })} - {searchResults.length === 0 && ( -
- {t('settings.documents.noResults')} -
- )} -
- )} -
-
setSelectedFile(null)} path={selectedFile.id} - showHeader={false} + renderFileSearch={renderFileSearch} />
) : ( -
-
-
- - - - - - - - - - {renderFileTree(currentDirectory)} - -
- Name - - Tokens - - Size - - Actions -
+
+
+ {renderPathNavigation()} +
+ +
+ {/* Left side: Search dropdown */} + {renderFileSearch()} + + {/* Right side: File table */} +
+
+ + + + + + + + + + + {renderFileTree(currentDirectory)} + +
+ Name + + Tokens + + Size + + Actions +
+
+
)} diff --git a/frontend/src/settings/Documents.tsx b/frontend/src/settings/Documents.tsx index 9a8e4195..30f71f26 100644 --- a/frontend/src/settings/Documents.tsx +++ b/frontend/src/settings/Documents.tsx @@ -272,7 +272,6 @@ export default function Documents({ documentId={documentToView.id || ''} documentName={documentToView.name} handleGoBack={() => setDocumentToView(undefined)} - showHeader={true} /> )}
From b00c4cc3b6efc9e25c3ed97fd133e2fa30cb6bea Mon Sep 17 00:00:00 2001 From: ManishMadan2882 Date: Wed, 23 Jul 2025 02:22:56 +0530 Subject: [PATCH 22/57] (feat:chunk) editing mode --- frontend/src/components/DocumentChunks.tsx | 347 +++++++++++++-------- 1 file changed, 211 insertions(+), 136 deletions(-) diff --git a/frontend/src/components/DocumentChunks.tsx b/frontend/src/components/DocumentChunks.tsx index e4dbc4a8..d2c402a9 100644 --- a/frontend/src/components/DocumentChunks.tsx +++ b/frontend/src/components/DocumentChunks.tsx @@ -39,11 +39,10 @@ const DocumentChunks: React.FC = ({ const [totalChunks, setTotalChunks] = useState(0); const [loading, setLoading] = useLoaderState(true); const [searchTerm, setSearchTerm] = useState(''); - const [addModal, setAddModal] = useState('INACTIVE'); - const [editModal, setEditModal] = useState<{ - state: ActiveState; - chunk: ChunkType | null; - }>({ state: 'INACTIVE', chunk: null }); + const [editingChunk, setEditingChunk] = useState(null); + const [editingTitle, setEditingTitle] = useState(''); + const [editingText, setEditingText] = useState(''); + const [isAddingChunk, setIsAddingChunk] = useState(false); @@ -132,7 +131,7 @@ const DocumentChunks: React.FC = ({ if (!response.ok) { throw new Error('Failed to delete chunk'); } - setEditModal({ state: 'INACTIVE', chunk: null }); + setEditingChunk(null); fetchChunks(); }); } catch (e) { @@ -153,36 +152,67 @@ const DocumentChunks: React.FC = ({ const renderPathNavigation = () => { return ( -
- +
+
+ -
- source - - {documentName} - +
+ source + + {documentName} + - {pathParts.length > 0 && ( - <> - / - {pathParts.map((part, index) => ( - - - {part} - - {index < pathParts.length - 1 && ( - / - )} - - ))} - - )} + {pathParts.length > 0 && ( + <> + / + {pathParts.map((part, index) => ( + + + {part} + + {index < pathParts.length - 1 && ( + / + )} + + ))} + + )} +
+ + {editingChunk && ( +
+ + + +
+ )}
); }; @@ -193,92 +223,161 @@ const DocumentChunks: React.FC = ({ {renderPathNavigation()}
- {renderFileSearch && ( - renderFileSearch() - )} + {renderFileSearch && renderFileSearch()} {/* Right side: Chunks content */}
-
-
-
- {totalChunks > 999999 - ? `${(totalChunks / 1000000).toFixed(2)}M` - : totalChunks > 999 - ? `${(totalChunks / 1000).toFixed(2)}K` - : totalChunks} {t('settings.documents.chunks')} + {!editingChunk && !isAddingChunk ? ( + <> +
+
+
+ {totalChunks > 999999 + ? `${(totalChunks / 1000000).toFixed(2)}M` + : totalChunks > 999 + ? `${(totalChunks / 1000).toFixed(2)}K` + : totalChunks} {t('settings.documents.chunks')} +
+
+
+ setSearchTerm(e.target.value)} + className="w-full h-full px-3 py-2 bg-transparent border-none outline-none font-normal text-[13.56px] leading-[100%] dark:text-[#E0E0E0]" + /> +
+
+
-
-
- setSearchTerm(e.target.value)} - className="w-full h-full px-3 py-2 bg-transparent border-none outline-none font-normal text-[13.56px] leading-[100%] dark:text-[#E0E0E0]" - /> -
-
- -
- {loading ? ( -
- -
- ) : ( -
- {filteredChunks.length === 0 ? ( -
- {t('settings.documents.noChunksAlt')} - {t('settings.documents.noChunks')} + {loading ? ( +
+
) : ( - filteredChunks.map((chunk, index) => ( -
-
-
-
- {chunk.metadata.token_count ? chunk.metadata.token_count.toLocaleString() : '-'} tokens -
- -
-
-

- {chunk.text} -

-
+
+ {filteredChunks.length === 0 ? ( +
+ {t('settings.documents.noChunksAlt')} + {t('settings.documents.noChunks')}
-
- )) + ) : ( + filteredChunks.map((chunk, index) => ( +
+
+
+
+ {chunk.metadata.token_count ? chunk.metadata.token_count.toLocaleString() : '-'} tokens +
+ +
+
+

+ {chunk.text} +

+
+
+
+ )) + )} +
)} + + ) : isAddingChunk ? ( + // Add new chunk view +
+
+
+ + + {t('settings.documents.addNewChunk')} + +
+
+ +
+
+ +
+
+ +
+ + +
+
+ ) : editingChunk && ( +
+
+
+
+ {editingChunk.metadata.token_count ? editingChunk.metadata.token_count.toLocaleString() : '-'} tokens +
+
+
+ +
+
)} - {!loading && filteredChunks.length > 0 && ( + {!loading && filteredChunks.length > 0 && !editingChunk && !isAddingChunk && ( = ({ }} /> )} - - - {editModal.chunk && ( - - setEditModal((prev) => ({ ...prev, state })) - } - handleSubmit={(title, text) => { - handleUpdateChunk(title, text, editModal.chunk as ChunkType); - }} - originalText={editModal.chunk?.text ?? ''} - originalTitle={editModal.chunk?.metadata?.title ?? ''} - handleDelete={() => { - handleDeleteChunk(editModal.chunk as ChunkType); - }} - /> - )}
-
-
- ); +
+
+ ); }; - export default DocumentChunks; +export default DocumentChunks; From 4d6ea401b570f998ce1988635274ba0e47841a1c Mon Sep 17 00:00:00 2001 From: ManishMadan2882 Date: Wed, 23 Jul 2025 03:36:18 +0530 Subject: [PATCH 23/57] (feat:chunks) line numbered editor --- frontend/src/components/DocumentChunks.tsx | 143 ++++++++++++++------- 1 file changed, 94 insertions(+), 49 deletions(-) diff --git a/frontend/src/components/DocumentChunks.tsx b/frontend/src/components/DocumentChunks.tsx index d2c402a9..9f3f639f 100644 --- a/frontend/src/components/DocumentChunks.tsx +++ b/frontend/src/components/DocumentChunks.tsx @@ -1,4 +1,4 @@ -import React, { useState, useEffect, useRef } from 'react'; +import React, { useState, useEffect } from 'react'; import { useSelector } from 'react-redux'; import { useTranslation } from 'react-i18next'; import { selectToken } from '../preferences/preferenceSlice'; @@ -15,6 +15,67 @@ import { ChunkType } from '../settings/types'; import EditIcon from '../assets/edit.svg'; import Pagination from './DocumentPagination'; +interface LineNumberedTextareaProps { + value: string; + onChange: (value: string) => void; + placeholder?: string; + ariaLabel?: string; + className?: string; +} + +const LineNumberedTextarea: React.FC = ({ + value, + onChange, + placeholder, + ariaLabel, + className = '' +}) => { + const handleChange = (e: React.ChangeEvent) => { + onChange(e.target.value); + }; + + const lineHeight = 19.93; + const contentLines = value.split('\n').length; + const minLinesForDisplay = Math.ceil((typeof window !== 'undefined' ? window.innerHeight - 300 : 600) / lineHeight); + const totalLines = Math.max(contentLines, minLinesForDisplay); + + return ( +
+
+ {Array.from({ length: totalLines }, (_, i) => ( +
+ {i + 1} +
+ ))} +
+ -
-
- -
- - +
+
) : editingChunk && ( @@ -365,13 +411,12 @@ const DocumentChunks: React.FC = ({ {editingChunk.metadata.token_count ? editingChunk.metadata.token_count.toLocaleString() : '-'} tokens
-
- + onChange={setEditingText} + ariaLabel={t('modals.chunk.promptText')} + />
From ad2f0f89501796c0235e4166d1ad041a502db2a9 Mon Sep 17 00:00:00 2001 From: ManishMadan2882 Date: Wed, 23 Jul 2025 20:47:36 +0530 Subject: [PATCH 24/57] (chore:chunks) i18n --- frontend/src/components/DocumentChunks.tsx | 43 ++++++++++++++++--- frontend/src/components/FileTreeComponent.tsx | 20 ++++----- frontend/src/locale/en.json | 14 +++++- frontend/src/locale/es.json | 14 +++++- frontend/src/locale/jp.json | 14 +++++- frontend/src/locale/ru.json | 14 +++++- frontend/src/locale/zh-TW.json | 14 +++++- frontend/src/locale/zh.json | 14 +++++- 8 files changed, 125 insertions(+), 22 deletions(-) diff --git a/frontend/src/components/DocumentChunks.tsx b/frontend/src/components/DocumentChunks.tsx index 9f3f639f..436e70ac 100644 --- a/frontend/src/components/DocumentChunks.tsx +++ b/frontend/src/components/DocumentChunks.tsx @@ -10,6 +10,7 @@ import NoFilesDarkIcon from '../assets/no-files-dark.svg'; import OutlineSource from '../assets/outline-source.svg'; import Spinner from '../components/Spinner'; import ChunkModal from '../modals/ChunkModal'; +import ConfirmationModal from '../modals/ConfirmationModal'; import { ActiveState } from '../models/misc'; import { ChunkType } from '../settings/types'; import EditIcon from '../assets/edit.svg'; @@ -104,6 +105,8 @@ const DocumentChunks: React.FC = ({ const [editingTitle, setEditingTitle] = useState(''); const [editingText, setEditingText] = useState(''); const [isAddingChunk, setIsAddingChunk] = useState(false); + const [deleteModalState, setDeleteModalState] = useState('INACTIVE'); + const [chunkToDelete, setChunkToDelete] = useState(null); @@ -200,6 +203,24 @@ const DocumentChunks: React.FC = ({ } }; + const confirmDeleteChunk = (chunk: ChunkType) => { + setChunkToDelete(chunk); + setDeleteModalState('ACTIVE'); + }; + + const handleConfirmedDelete = () => { + if (chunkToDelete) { + handleDeleteChunk(chunkToDelete); + setDeleteModalState('INACTIVE'); + setChunkToDelete(null); + } + }; + + const handleCancelDelete = () => { + setDeleteModalState('INACTIVE'); + setChunkToDelete(null); + }; + useEffect(() => { fetchChunks(); }, [page, perPage, path]); @@ -251,8 +272,7 @@ const DocumentChunks: React.FC = ({
); }; diff --git a/frontend/src/components/FileTreeComponent.tsx b/frontend/src/components/FileTreeComponent.tsx index 8cdbb4f3..20d27e47 100644 --- a/frontend/src/components/FileTreeComponent.tsx +++ b/frontend/src/components/FileTreeComponent.tsx @@ -283,7 +283,7 @@ const FileTreeComponent: React.FC = ({
Parent folder .. @@ -312,7 +312,7 @@ const FileTreeComponent: React.FC = ({ >
- Folder + {t('settings.documents.folderAlt')} {name}
@@ -333,7 +333,7 @@ const FileTreeComponent: React.FC = ({ > Menu @@ -364,7 +364,7 @@ const FileTreeComponent: React.FC = ({ >
- File + {t('settings.documents.fileAlt')} {name}
@@ -383,7 +383,7 @@ const FileTreeComponent: React.FC = ({ > Menu @@ -490,7 +490,7 @@ const FileTreeComponent: React.FC = ({ > {result.isFile @@ -537,16 +537,16 @@ const FileTreeComponent: React.FC = ({ - Name + {t('settings.documents.fileName')} - Tokens + {t('settings.documents.tokens')} - Size + {t('settings.documents.size')} - Actions + {t('settings.documents.actions')} diff --git a/frontend/src/locale/en.json b/frontend/src/locale/en.json index b538cdde..3c529fc5 100644 --- a/frontend/src/locale/en.json +++ b/frontend/src/locale/en.json @@ -79,7 +79,18 @@ "noChunks": "No chunks found", "noChunksAlt": "No chunks found", "goToDocuments": "Go to Documents", - "uploadNew": "Upload new" + "uploadNew": "Upload new", + "searchFiles": "Search files...", + "noResults": "No results found", + "fileName": "Name", + "tokens": "Tokens", + "size": "Size", + "fileAlt": "File", + "folderAlt": "Folder", + "parentFolderAlt": "Parent folder", + "menuAlt": "Menu", + "tokensUnit": "tokens", + "editAlt": "Edit" }, "apiKeys": { "label": "Chatbots", @@ -257,6 +268,7 @@ "promptText": "Prompt Text", "update": "Update", "close": "Close", + "cancel": "Cancel", "delete": "Delete", "deleteConfirmation": "Are you sure you want to delete this chunk?" } diff --git a/frontend/src/locale/es.json b/frontend/src/locale/es.json index bcdebfcb..0b54582b 100644 --- a/frontend/src/locale/es.json +++ b/frontend/src/locale/es.json @@ -79,7 +79,18 @@ "noChunks": "No se encontraron fragmentos", "noChunksAlt": "No se encontraron fragmentos", "goToDocuments": "Ir a Documentos", - "uploadNew": "Subir nuevo" + "uploadNew": "Subir nuevo", + "searchFiles": "Buscar archivos...", + "noResults": "No se encontraron resultados", + "fileName": "Nombre", + "tokens": "Tokens", + "size": "Tamaño", + "fileAlt": "Archivo", + "folderAlt": "Carpeta", + "parentFolderAlt": "Carpeta padre", + "menuAlt": "Menú", + "tokensUnit": "tokens", + "editAlt": "Editar" }, "apiKeys": { "label": "Chatbots", @@ -257,6 +268,7 @@ "promptText": "Texto del prompt", "update": "Actualizar", "close": "Cerrar", + "cancel": "Cancelar", "delete": "Eliminar", "deleteConfirmation": "¿Estás seguro de que deseas eliminar este fragmento?" } diff --git a/frontend/src/locale/jp.json b/frontend/src/locale/jp.json index d004e0dc..42d7bb93 100644 --- a/frontend/src/locale/jp.json +++ b/frontend/src/locale/jp.json @@ -79,7 +79,18 @@ "noChunks": "チャンクが見つかりません", "noChunksAlt": "チャンクが見つかりません", "goToDocuments": "ドキュメントへ移動", - "uploadNew": "新規アップロード" + "uploadNew": "新規アップロード", + "searchFiles": "ファイルを検索...", + "noResults": "結果が見つかりません", + "fileName": "名前", + "tokens": "トークン", + "size": "サイズ", + "fileAlt": "ファイル", + "folderAlt": "フォルダ", + "parentFolderAlt": "親フォルダ", + "menuAlt": "メニュー", + "tokensUnit": "トークン", + "editAlt": "編集" }, "apiKeys": { "label": "チャットボット", @@ -257,6 +268,7 @@ "promptText": "プロンプトテキスト", "update": "更新", "close": "閉じる", + "cancel": "キャンセル", "delete": "削除", "deleteConfirmation": "このチャンクを削除してもよろしいですか?" } diff --git a/frontend/src/locale/ru.json b/frontend/src/locale/ru.json index 95c7a228..13ae6829 100644 --- a/frontend/src/locale/ru.json +++ b/frontend/src/locale/ru.json @@ -79,7 +79,18 @@ "noChunks": "Фрагменты не найдены", "noChunksAlt": "Фрагменты не найдены", "goToDocuments": "Перейти к документам", - "uploadNew": "Загрузить новый" + "uploadNew": "Загрузить новый", + "searchFiles": "Поиск файлов...", + "noResults": "Результаты не найдены", + "fileName": "Имя", + "tokens": "Токены", + "size": "Размер", + "fileAlt": "Файл", + "folderAlt": "Папка", + "parentFolderAlt": "Родительская папка", + "menuAlt": "Меню", + "tokensUnit": "токенов", + "editAlt": "Редактировать" }, "apiKeys": { "label": "API ключи", @@ -257,6 +268,7 @@ "promptText": "Текст подсказки", "update": "Обновить", "close": "Закрыть", + "cancel": "Отмена", "delete": "Удалить", "deleteConfirmation": "Вы уверены, что хотите удалить этот фрагмент?" } diff --git a/frontend/src/locale/zh-TW.json b/frontend/src/locale/zh-TW.json index 36baa8b1..f590b63b 100644 --- a/frontend/src/locale/zh-TW.json +++ b/frontend/src/locale/zh-TW.json @@ -79,7 +79,18 @@ "noChunks": "未找到文本塊", "noChunksAlt": "未找到文本塊", "goToDocuments": "前往文件", - "uploadNew": "上傳新文件" + "uploadNew": "上傳新文件", + "searchFiles": "搜尋檔案...", + "noResults": "未找到結果", + "fileName": "名稱", + "tokens": "Token", + "size": "大小", + "fileAlt": "檔案", + "folderAlt": "資料夾", + "parentFolderAlt": "上層資料夾", + "menuAlt": "選單", + "tokensUnit": "Token", + "editAlt": "編輯" }, "apiKeys": { "label": "聊天機器人", @@ -257,6 +268,7 @@ "promptText": "提示文字", "update": "更新", "close": "關閉", + "cancel": "取消", "delete": "刪除", "deleteConfirmation": "您確定要刪除此區塊嗎?" } diff --git a/frontend/src/locale/zh.json b/frontend/src/locale/zh.json index 804f5fb2..c1a6b4a0 100644 --- a/frontend/src/locale/zh.json +++ b/frontend/src/locale/zh.json @@ -79,7 +79,18 @@ "noChunks": "未找到文本块", "noChunksAlt": "未找到文本块", "goToDocuments": "前往文档", - "uploadNew": "上传新文档" + "uploadNew": "上传新文档", + "searchFiles": "搜索文件...", + "noResults": "未找到结果", + "fileName": "名称", + "tokens": "令牌", + "size": "大小", + "fileAlt": "文件", + "folderAlt": "文件夹", + "parentFolderAlt": "父文件夹", + "menuAlt": "菜单", + "tokensUnit": "令牌", + "editAlt": "编辑" }, "apiKeys": { "label": "聊天机器人", @@ -257,6 +268,7 @@ "promptText": "提示文本", "update": "更新", "close": "关闭", + "cancel": "取消", "delete": "删除", "deleteConfirmation": "您确定要删除此块吗?" } From 58465ece65069467b16bb20f72e2c856b40180be Mon Sep 17 00:00:00 2001 From: ManishMadan2882 Date: Fri, 25 Jul 2025 01:43:50 +0530 Subject: [PATCH 25/57] (feat:chunks) server-side filter on search --- application/api/user/routes.py | 39 ++++++++++++++------ frontend/src/api/endpoints.ts | 5 ++- frontend/src/api/services/userService.ts | 3 +- frontend/src/components/DocumentChunks.tsx | 42 ++++++++++++++++------ 4 files changed, 66 insertions(+), 23 deletions(-) diff --git a/application/api/user/routes.py b/application/api/user/routes.py index 785b6b4c..604dff15 100644 --- a/application/api/user/routes.py +++ b/application/api/user/routes.py @@ -3265,12 +3265,13 @@ class DeleteTool(Resource): @user_ns.route("/api/get_chunks") class GetChunks(Resource): @api.doc( - description="Retrieves chunks from a document, optionally filtered by file path", + description="Retrieves chunks from a document, optionally filtered by file path and search term", params={ "id": "The document ID", "page": "Page number for pagination", "per_page": "Number of chunks per page", - "path": "Optional: Filter chunks by relative file path" + "path": "Optional: Filter chunks by relative file path", + "search": "Optional: Search term to filter chunks by title or content" }, ) def get(self): @@ -3282,6 +3283,7 @@ class GetChunks(Resource): page = int(request.args.get("page", 1)) per_page = int(request.args.get("per_page", 10)) path = request.args.get("path") + search_term = request.args.get("search", "").strip().lower() if not ObjectId.is_valid(doc_id): return make_response(jsonify({"error": "Invalid doc_id"}), 400) @@ -3294,20 +3296,35 @@ class GetChunks(Resource): store = get_vector_store(doc_id) chunks = store.get_chunks() - if path: - filtered_chunks = [] - for chunk in chunks: - metadata = chunk.get("metadata", {}) + filtered_chunks = [] + for chunk in chunks: + metadata = chunk.get("metadata", {}) + + if path: source = metadata.get("source", "") + path_match = False if isinstance(source, str) and source.endswith(path): - filtered_chunks.append(chunk) + path_match = True elif isinstance(source, list): for src in source: if isinstance(src, str) and src.endswith(path): - filtered_chunks.append(chunk) + path_match = True break - chunks = filtered_chunks + + if not path_match: + continue + + if search_term: + text_match = search_term in chunk.get("text", "").lower() + title_match = search_term in metadata.get("title", "").lower() + + if not (text_match or title_match): + continue + + filtered_chunks.append(chunk) + + chunks = filtered_chunks total_chunks = len(chunks) start = (page - 1) * per_page @@ -3321,7 +3338,8 @@ class GetChunks(Resource): "per_page": per_page, "total": total_chunks, "chunks": paginated_chunks, - "path": path if path else None + "path": path if path else None, + "search": search_term if search_term else None } ), 200, @@ -3330,7 +3348,6 @@ class GetChunks(Resource): current_app.logger.error(f"Error getting chunks: {e}", exc_info=True) return make_response(jsonify({"success": False}), 500) - @user_ns.route("/api/add_chunk") class AddChunk(Resource): @api.expect( diff --git a/frontend/src/api/endpoints.ts b/frontend/src/api/endpoints.ts index 37c52c41..2980da46 100644 --- a/frontend/src/api/endpoints.ts +++ b/frontend/src/api/endpoints.ts @@ -43,8 +43,11 @@ const endpoints = { page: number, per_page: number, path?: string, + search?: string, ) => - `/api/get_chunks?id=${docId}&page=${page}&per_page=${per_page}${path ? `&path=${encodeURIComponent(path)}` : ''}`, + `/api/get_chunks?id=${docId}&page=${page}&per_page=${per_page}${ + path ? `&path=${encodeURIComponent(path)}` : '' + }${search ? `&search=${encodeURIComponent(search)}` : ''}`, ADD_CHUNK: '/api/add_chunk', DELETE_CHUNK: (docId: string, chunkId: string) => `/api/delete_chunk?id=${docId}&chunk_id=${chunkId}`, diff --git a/frontend/src/api/services/userService.ts b/frontend/src/api/services/userService.ts index 43671657..a2968312 100644 --- a/frontend/src/api/services/userService.ts +++ b/frontend/src/api/services/userService.ts @@ -87,8 +87,9 @@ const userService = { perPage: number, token: string | null, path?: string, + search?: string, ): Promise => - apiClient.get(endpoints.USER.GET_CHUNKS(docId, page, perPage, path), token), + apiClient.get(endpoints.USER.GET_CHUNKS(docId, page, perPage, path, search), token), addChunk: (data: any, token: string | null): Promise => apiClient.post(endpoints.USER.ADD_CHUNK, data, token), deleteChunk: ( diff --git a/frontend/src/components/DocumentChunks.tsx b/frontend/src/components/DocumentChunks.tsx index 436e70ac..c11f3559 100644 --- a/frontend/src/components/DocumentChunks.tsx +++ b/frontend/src/components/DocumentChunks.tsx @@ -116,7 +116,7 @@ const DocumentChunks: React.FC = ({ setLoading(true); try { userService - .getDocumentChunks(documentId, page, perPage, token, path) + .getDocumentChunks(documentId, page, perPage, token, path, searchTerm) .then((response) => { if (!response.ok) { setLoading(false); @@ -131,10 +131,14 @@ const DocumentChunks: React.FC = ({ setTotalChunks(data.total); setPaginatedChunks(data.chunks); setLoading(false); + }) + .catch((error) => { + setLoading(false); + setPaginatedChunks([]); }); } catch (e) { - console.log(e); setLoading(false); + setPaginatedChunks([]); } }; @@ -221,16 +225,34 @@ const DocumentChunks: React.FC = ({ setChunkToDelete(null); }; + useEffect(() => { + const delayDebounceFn = setTimeout(() => { + if (page !== 1) { + setPage(1); + } else { + fetchChunks(); + } + }, 300); + + return () => clearTimeout(delayDebounceFn); + }, [searchTerm]); useEffect(() => { fetchChunks(); }, [page, perPage, path]); + useEffect(() => { + setSearchTerm(''); + setPage(1); + }, [path]); + // Remove the client-side filtering + // const filteredChunks = paginatedChunks.filter((chunk) => { + // if (!chunk.metadata?.title) return true; + // return chunk.metadata.title + // .toLowerCase() + // .includes(searchTerm.toLowerCase()); + // }); - const filteredChunks = paginatedChunks.filter((chunk) => { - if (!chunk.metadata?.title) return true; - return chunk.metadata.title - .toLowerCase() - .includes(searchTerm.toLowerCase()); - }); + // Use the server-filtered chunks directly + const filteredChunks = paginatedChunks; const renderPathNavigation = () => { return ( @@ -367,7 +389,7 @@ const DocumentChunks: React.FC = ({
) : ( -
+
{filteredChunks.length === 0 ? (
= ({
)} - {!loading && filteredChunks.length > 0 && !editingChunk && !isAddingChunk && ( + {!loading && totalChunks > perPage && !editingChunk && !isAddingChunk && ( Date: Fri, 25 Jul 2025 04:05:06 +0530 Subject: [PATCH 26/57] (feat:chunks) ask to edit, ui --- frontend/src/components/DocumentChunks.tsx | 125 ++++++++++++--------- 1 file changed, 74 insertions(+), 51 deletions(-) diff --git a/frontend/src/components/DocumentChunks.tsx b/frontend/src/components/DocumentChunks.tsx index c11f3559..ccdd5129 100644 --- a/frontend/src/components/DocumentChunks.tsx +++ b/frontend/src/components/DocumentChunks.tsx @@ -22,6 +22,7 @@ interface LineNumberedTextareaProps { placeholder?: string; ariaLabel?: string; className?: string; + editable?: boolean; } const LineNumberedTextarea: React.FC = ({ @@ -29,7 +30,8 @@ const LineNumberedTextarea: React.FC = ({ onChange, placeholder, ariaLabel, - className = '' + className = '', + editable = true }) => { const handleChange = (e: React.ChangeEvent) => { onChange(e.target.value); @@ -61,18 +63,29 @@ const LineNumberedTextarea: React.FC = ({
))}
-