From e602d941cac3175cf624584d8003839932a8e053 Mon Sep 17 00:00:00 2001 From: Alex Date: Thu, 5 Feb 2026 19:40:35 +0000 Subject: [PATCH] fix: sources display (#2274) * fix: sources display * fix: sources display2 --- application/api/internal/routes.py | 77 +++++++++++++---------- application/api/user/sources/upload.py | 56 ++++++++++++++++- application/api/user/tasks.py | 15 ++++- application/worker.py | 76 +++++++++++++++++++++- frontend/src/components/Chunks.tsx | 10 ++- frontend/src/components/ConnectorTree.tsx | 72 +++++++++++++++++---- frontend/src/components/FileTree.tsx | 72 +++++++++++++++++---- frontend/src/upload/Upload.tsx | 12 +++- 8 files changed, 326 insertions(+), 64 deletions(-) diff --git a/application/api/internal/routes.py b/application/api/internal/routes.py index f812a37f..8a771585 100755 --- a/application/api/internal/routes.py +++ b/application/api/internal/routes.py @@ -61,6 +61,7 @@ def upload_index_files(): file_path = request.form.get("file_path") directory_structure = request.form.get("directory_structure") + file_name_map = request.form.get("file_name_map") if directory_structure: try: @@ -70,6 +71,14 @@ def upload_index_files(): directory_structure = {} else: directory_structure = {} + if file_name_map: + try: + file_name_map = json.loads(file_name_map) + except Exception: + logger.error("Error parsing file_name_map") + file_name_map = None + else: + file_name_map = None storage = StorageCreator.get_storage() index_base_path = f"indexes/{id}" @@ -97,41 +106,43 @@ def upload_index_files(): existing_entry = sources_collection.find_one({"_id": ObjectId(id)}) if existing_entry: + update_fields = { + "user": user, + "name": job_name, + "language": job_name, + "date": datetime.datetime.now(), + "model": settings.EMBEDDINGS_NAME, + "type": type, + "tokens": tokens, + "retriever": retriever, + "remote_data": remote_data, + "sync_frequency": sync_frequency, + "file_path": file_path, + "directory_structure": directory_structure, + } + if file_name_map is not None: + update_fields["file_name_map"] = file_name_map sources_collection.update_one( {"_id": ObjectId(id)}, - { - "$set": { - "user": user, - "name": job_name, - "language": job_name, - "date": datetime.datetime.now(), - "model": settings.EMBEDDINGS_NAME, - "type": type, - "tokens": tokens, - "retriever": retriever, - "remote_data": remote_data, - "sync_frequency": sync_frequency, - "file_path": file_path, - "directory_structure": directory_structure, - } - }, + {"$set": update_fields}, ) else: - sources_collection.insert_one( - { - "_id": ObjectId(id), - "user": user, - "name": job_name, - "language": job_name, - "date": datetime.datetime.now(), - "model": settings.EMBEDDINGS_NAME, - "type": type, - "tokens": tokens, - "retriever": retriever, - "remote_data": remote_data, - "sync_frequency": sync_frequency, - "file_path": file_path, - "directory_structure": directory_structure, - } - ) + insert_doc = { + "_id": ObjectId(id), + "user": user, + "name": job_name, + "language": job_name, + "date": datetime.datetime.now(), + "model": settings.EMBEDDINGS_NAME, + "type": type, + "tokens": tokens, + "retriever": retriever, + "remote_data": remote_data, + "sync_frequency": sync_frequency, + "file_path": file_path, + "directory_structure": directory_structure, + } + if file_name_map is not None: + insert_doc["file_name_map"] = file_name_map + sources_collection.insert_one(insert_doc) return {"status": "ok"} diff --git a/application/api/user/sources/upload.py b/application/api/user/sources/upload.py index 0d2f43c8..313f9e7f 100644 --- a/application/api/user/sources/upload.py +++ b/application/api/user/sources/upload.py @@ -64,13 +64,16 @@ class UploadFile(Resource): safe_user = safe_filename(user) dir_name = safe_filename(job_name) base_path = f"{settings.UPLOAD_FOLDER}/{safe_user}/{dir_name}" + file_name_map = {} try: storage = StorageCreator.get_storage() for file in files: - original_filename = file.filename + original_filename = os.path.basename(file.filename) safe_file = safe_filename(original_filename) + if original_filename: + file_name_map[safe_file] = original_filename with tempfile.TemporaryDirectory() as temp_dir: temp_file_path = os.path.join(temp_dir, safe_file) @@ -142,6 +145,7 @@ class UploadFile(Resource): user, file_path=base_path, filename=dir_name, + file_name_map=file_name_map, ) except Exception as err: current_app.logger.error(f"Error uploading file: {err}", exc_info=True) @@ -341,6 +345,14 @@ class ManageSourceFiles(Resource): storage = StorageCreator.get_storage() source_file_path = source.get("file_path", "") parent_dir = request.form.get("parent_dir", "") + file_name_map = source.get("file_name_map") or {} + if isinstance(file_name_map, str): + try: + file_name_map = json.loads(file_name_map) + except Exception: + file_name_map = {} + if not isinstance(file_name_map, dict): + file_name_map = {} if parent_dir and (parent_dir.startswith("/") or ".." in parent_dir): return make_response( @@ -362,19 +374,35 @@ class ManageSourceFiles(Resource): 400, ) added_files = [] + map_updated = False target_dir = source_file_path if parent_dir: target_dir = f"{source_file_path}/{parent_dir}" for file in files: if file.filename: - safe_filename_str = safe_filename(file.filename) + original_filename = os.path.basename(file.filename) + safe_filename_str = safe_filename(original_filename) file_path = f"{target_dir}/{safe_filename_str}" # Save file to storage storage.save_file(file, file_path) added_files.append(safe_filename_str) + if original_filename: + relative_key = ( + f"{parent_dir}/{safe_filename_str}" + if parent_dir + else safe_filename_str + ) + file_name_map[relative_key] = original_filename + map_updated = True + + if map_updated: + sources_collection.update_one( + {"_id": ObjectId(source_id)}, + {"$set": {"file_name_map": file_name_map}}, + ) # Trigger re-ingestion pipeline from application.api.user.tasks import reingest_source_task @@ -421,6 +449,7 @@ class ManageSourceFiles(Resource): # Remove files from storage and directory structure removed_files = [] + map_updated = False for file_path in file_paths: full_path = f"{source_file_path}/{file_path}" @@ -429,6 +458,15 @@ class ManageSourceFiles(Resource): if storage.file_exists(full_path): storage.delete_file(full_path) removed_files.append(file_path) + if file_path in file_name_map: + file_name_map.pop(file_path, None) + map_updated = True + + if map_updated and isinstance(file_name_map, dict): + sources_collection.update_one( + {"_id": ObjectId(source_id)}, + {"$set": {"file_name_map": file_name_map}}, + ) # Trigger re-ingestion pipeline from application.api.user.tasks import reingest_source_task @@ -511,6 +549,20 @@ class ManageSourceFiles(Resource): f"User: {user}, Source ID: {source_id}, Directory path: {directory_path}, " f"Full path: {full_directory_path}" ) + if directory_path and file_name_map: + prefix = f"{directory_path.rstrip('/')}/" + keys_to_remove = [ + key + for key in file_name_map.keys() + if key == directory_path or key.startswith(prefix) + ] + if keys_to_remove: + for key in keys_to_remove: + file_name_map.pop(key, None) + sources_collection.update_one( + {"_id": ObjectId(source_id)}, + {"$set": {"file_name_map": file_name_map}}, + ) # Trigger re-ingestion pipeline diff --git a/application/api/user/tasks.py b/application/api/user/tasks.py index 1011be00..717e2fa4 100644 --- a/application/api/user/tasks.py +++ b/application/api/user/tasks.py @@ -14,8 +14,19 @@ from application.worker import ( @celery.task(bind=True) -def ingest(self, directory, formats, job_name, user, file_path, filename): - resp = ingest_worker(self, directory, formats, job_name, file_path, filename, user) +def ingest( + self, directory, formats, job_name, user, file_path, filename, file_name_map=None +): + resp = ingest_worker( + self, + directory, + formats, + job_name, + file_path, + filename, + user, + file_name_map=file_name_map, + ) return resp diff --git a/application/worker.py b/application/worker.py index a4265bc1..d746396c 100755 --- a/application/worker.py +++ b/application/worker.py @@ -52,6 +52,41 @@ def metadata_from_filename(title): return {"title": title} +def _normalize_file_name_map(file_name_map): + if not file_name_map: + return {} + if isinstance(file_name_map, str): + try: + file_name_map = json.loads(file_name_map) + except Exception: + return {} + return file_name_map if isinstance(file_name_map, dict) else {} + + +def _get_display_name(file_name_map, rel_path): + if not file_name_map or not rel_path: + return None + if rel_path in file_name_map: + return file_name_map[rel_path] + base_name = os.path.basename(rel_path) + return file_name_map.get(base_name) + + +def _apply_display_names_to_structure(structure, file_name_map, prefix=""): + if not isinstance(structure, dict) or not file_name_map: + return structure + for name, node in structure.items(): + if isinstance(node, dict) and "type" in node and "size_bytes" in node: + rel_path = f"{prefix}/{name}" if prefix else name + display_name = _get_display_name(file_name_map, rel_path) + if display_name: + node["display_name"] = display_name + elif isinstance(node, dict): + next_prefix = f"{prefix}/{name}" if prefix else name + _apply_display_names_to_structure(node, file_name_map, next_prefix) + return structure + + # Define a function to generate a random string of a given length. @@ -375,7 +410,15 @@ def run_agent_logic(agent_config, input_data): def ingest_worker( - self, directory, formats, job_name, file_path, filename, user, retriever="classic" + self, + directory, + formats, + job_name, + file_path, + filename, + user, + retriever="classic", + file_name_map=None, ): """ Ingest and process documents. @@ -389,6 +432,7 @@ def ingest_worker( filename (str): Original unsanitized filename provided by the user. user (str): Identifier for the user initiating the ingestion (original, unsanitized). retriever (str): Type of retriever to use for processing the documents. + file_name_map (dict|str|None): Optional mapping of safe relative paths to original filenames. Returns: dict: Information about the completed ingestion task, including input parameters and a "limited" flag. @@ -468,6 +512,22 @@ def ingest_worker( directory_structure = getattr(reader, "directory_structure", {}) logging.info(f"Directory structure from reader: {directory_structure}") + file_name_map = _normalize_file_name_map(file_name_map) + if file_name_map: + for doc in raw_docs: + extra_info = getattr(doc, "extra_info", None) + if not isinstance(extra_info, dict): + continue + rel_path = extra_info.get("source") or extra_info.get("file_path") + display_name = _get_display_name(file_name_map, rel_path) + if display_name: + display_name = str(display_name) + extra_info["filename"] = display_name + extra_info["file_name"] = display_name + extra_info["title"] = display_name + directory_structure = _apply_display_names_to_structure( + directory_structure, file_name_map + ) chunker = Chunker( chunking_strategy="classic_chunk", @@ -504,6 +564,8 @@ def ingest_worker( "file_path": file_path, "directory_structure": json.dumps(directory_structure), } + if file_name_map: + file_data["file_name_map"] = json.dumps(file_name_map) upload_index(vector_store_path, file_data) except Exception as e: @@ -547,6 +609,7 @@ def reingest_source_worker(self, source_id, user): storage = StorageCreator.get_storage() source_file_path = source.get("file_path", "") + file_name_map = _normalize_file_name_map(source.get("file_name_map")) self.update_state( state="PROGRESS", meta={"current": 20, "status": "Scanning current files"} @@ -781,6 +844,14 @@ def reingest_source_worker(self, source_id, user): ) except Exception: pass + display_name = _get_display_name( + file_name_map, meta.get("source") + ) + if display_name: + display_name = str(display_name) + meta["filename"] = display_name + meta["file_name"] = display_name + meta["title"] = display_name vector_store.add_chunk(d.text, metadata=meta) added += 1 @@ -795,6 +866,9 @@ def reingest_source_worker(self, source_id, user): # 3) Update source directory structure timestamp try: total_tokens = sum(reader.file_token_counts.values()) + directory_structure = _apply_display_names_to_structure( + directory_structure, file_name_map + ) sources_collection.update_one( {"_id": ObjectId(source_id)}, diff --git a/frontend/src/components/Chunks.tsx b/frontend/src/components/Chunks.tsx index bdb1c4c5..060b6dfc 100644 --- a/frontend/src/components/Chunks.tsx +++ b/frontend/src/components/Chunks.tsx @@ -92,6 +92,7 @@ const LineNumberedTextarea: React.FC = ({ interface SearchResult { path: string; isFile: boolean; + name?: string; } interface ChunksProps { @@ -99,6 +100,7 @@ interface ChunksProps { documentName?: string; handleGoBack: () => void; path?: string; + displayPath?: string; onFileSearch?: (query: string) => SearchResult[]; onFileSelect?: (path: string) => void; } @@ -108,6 +110,7 @@ const Chunks: React.FC = ({ documentName, handleGoBack, path, + displayPath, onFileSearch, onFileSelect, }) => { @@ -134,7 +137,8 @@ const Chunks: React.FC = ({ const [chunkToDelete, setChunkToDelete] = useState(null); const [isEditing, setIsEditing] = useState(false); - const pathParts = path ? path.split('/') : []; + const displayPathValue = displayPath ?? path ?? ''; + const pathParts = displayPathValue ? displayPathValue.split('/') : []; const fetchChunks = async () => { setLoading(true); @@ -515,7 +519,9 @@ const Chunks: React.FC = ({ className="mr-2 h-4 w-4 flex-shrink-0" /> - {result.path.split('/').pop() || result.path} + {result.name || + result.path.split('/').pop() || + result.path} )) diff --git a/frontend/src/components/ConnectorTree.tsx b/frontend/src/components/ConnectorTree.tsx index 3d823022..a05ec38b 100644 --- a/frontend/src/components/ConnectorTree.tsx +++ b/frontend/src/components/ConnectorTree.tsx @@ -31,6 +31,7 @@ interface FileNode { type?: string; token_count?: number; size_bytes?: number; + display_name?: string; [key: string]: any; } @@ -90,11 +91,11 @@ const ConnectorTree: React.FC = ({ false, ); - const handleFileClick = (fileName: string) => { + const handleFileClick = (fileName: string, displayName?: string) => { const fullPath = [...currentPath, fileName].join('/'); setSelectedFile({ id: fullPath, - name: fileName, + name: displayName ?? fileName, }); }; @@ -261,6 +262,7 @@ const ConnectorTree: React.FC = ({ name: string, isFile: boolean, _itemId: string, + displayName?: string, ): MenuOption[] => { const options: MenuOption[] = []; @@ -270,7 +272,7 @@ const ConnectorTree: React.FC = ({ onClick: (event: React.SyntheticEvent) => { event.stopPropagation(); if (isFile) { - handleFileClick(name); + handleFileClick(name, displayName); } else { navigateToDirectory(name); } @@ -495,9 +497,16 @@ const ConnectorTree: React.FC = ({ .map(([name, node]) => { const itemId = `file-${name}`; const menuRef = getMenuRef(itemId); + const displayName = + typeof node.display_name === 'string' && node.display_name.trim() + ? node.display_name + : name; return ( - handleFileClick(name)}> + handleFileClick(name, displayName)} + >
= ({ alt={t('settings.sources.fileAlt')} className="mr-2 h-4 w-4 flex-shrink-0" /> - {name} + {displayName}
@@ -532,7 +541,7 @@ const ConnectorTree: React.FC = ({ setIsOpen={(isOpen) => setActiveMenuId(isOpen ? itemId : null) } - options={getActionOptions(name, true, itemId)} + options={getActionOptions(name, true, itemId, displayName)} anchorRef={menuRef} position="bottom-left" offset={{ x: -4, y: 4 }} @@ -555,10 +564,16 @@ const ConnectorTree: React.FC = ({ Object.entries(structure).forEach(([name, node]) => { const fullPath = [...currentPath, name].join('/'); + const displayName = + typeof node.display_name === 'string' && node.display_name.trim() + ? node.display_name + : ''; + const queryLower = query.toLowerCase(); + const matchTarget = displayName ? `${name} ${displayName}` : name; - if (name.toLowerCase().includes(query.toLowerCase())) { + if (matchTarget.toLowerCase().includes(queryLower)) { results.push({ - name, + name: displayName || name, path: fullPath, isFile: !!node.type, }); @@ -587,7 +602,7 @@ const ConnectorTree: React.FC = ({ setSelectedFile({ id: result.path, - name: fileName, + name: result.name || fileName, }); } else { setCurrentPath(result.path.split('/')); @@ -642,7 +657,7 @@ const ConnectorTree: React.FC = ({ className="mr-2 h-4 w-4 flex-shrink-0" /> - {result.path.split('/').pop() || result.path} + {result.name} )) @@ -661,13 +676,47 @@ const ConnectorTree: React.FC = ({ return []; }; + const getDisplayNameForPath = (path: string) => { + if (!directoryStructure) { + return path.split('/').pop() || path; + } + let structure: any = directoryStructure; + if (typeof structure === 'string') { + try { + structure = JSON.parse(structure); + } catch (e) { + return path.split('/').pop() || path; + } + } + if (typeof structure !== 'object' || structure === null) { + return path.split('/').pop() || path; + } + const parts = path.split('/').filter(Boolean); + let current: any = structure; + for (const part of parts) { + if (!current || typeof current !== 'object') { + return parts[parts.length - 1] || path; + } + current = current[part]; + } + if ( + current && + typeof current === 'object' && + typeof current.display_name === 'string' && + current.display_name.trim() + ) { + return current.display_name; + } + return parts[parts.length - 1] || path; + }; + const handleFileSelect = (path: string) => { const pathParts = path.split('/'); const fileName = pathParts.pop() || ''; setCurrentPath(pathParts); setSelectedFile({ id: path, - name: fileName, + name: getDisplayNameForPath(path) || fileName, }); }; @@ -687,6 +736,7 @@ const ConnectorTree: React.FC = ({ documentName={sourceName} handleGoBack={() => setSelectedFile(null)} path={selectedFile.id} + displayPath={[...currentPath, selectedFile.name].join('/')} onFileSearch={handleFileSearch} onFileSelect={handleFileSelect} /> diff --git a/frontend/src/components/FileTree.tsx b/frontend/src/components/FileTree.tsx index 039d7eeb..0fa0e7a0 100644 --- a/frontend/src/components/FileTree.tsx +++ b/frontend/src/components/FileTree.tsx @@ -29,6 +29,7 @@ interface FileNode { type?: string; token_count?: number; size_bytes?: number; + display_name?: string; [key: string]: any; } @@ -104,11 +105,11 @@ const FileTree: React.FC = ({ false, ); - const handleFileClick = (fileName: string) => { + const handleFileClick = (fileName: string, displayName?: string) => { const fullPath = [...currentPath, fileName].join('/'); setSelectedFile({ id: fullPath, - name: fileName, + name: displayName ?? fileName, }); }; @@ -214,6 +215,7 @@ const FileTree: React.FC = ({ name: string, isFile: boolean, _itemId: string, + displayName?: string, ): MenuOption[] => { const options: MenuOption[] = []; @@ -223,7 +225,7 @@ const FileTree: React.FC = ({ onClick: (event: React.SyntheticEvent) => { event.stopPropagation(); if (isFile) { - handleFileClick(name); + handleFileClick(name, displayName); } else { navigateToDirectory(name); } @@ -624,9 +626,16 @@ const FileTree: React.FC = ({ ...files.map(([name, node]) => { const itemId = `file-${name}`; const menuRef = getMenuRef(itemId); + const displayName = + typeof node.display_name === 'string' && node.display_name.trim() + ? node.display_name + : name; return ( - handleFileClick(name)}> + handleFileClick(name, displayName)} + >
= ({ alt={t('settings.sources.fileAlt')} className="mr-2 h-4 w-4 flex-shrink-0" /> - {name} + {displayName}
@@ -661,7 +670,7 @@ const FileTree: React.FC = ({ setIsOpen={(isOpen) => setActiveMenuId(isOpen ? itemId : null) } - options={getActionOptions(name, true, itemId)} + options={getActionOptions(name, true, itemId, displayName)} anchorRef={menuRef} position="bottom-left" offset={{ x: -4, y: 4 }} @@ -684,10 +693,16 @@ const FileTree: React.FC = ({ Object.entries(structure).forEach(([name, node]) => { const fullPath = [...currentPath, name].join('/'); + const displayName = + typeof node.display_name === 'string' && node.display_name.trim() + ? node.display_name + : ''; + const queryLower = query.toLowerCase(); + const matchTarget = displayName ? `${name} ${displayName}` : name; - if (name.toLowerCase().includes(query.toLowerCase())) { + if (matchTarget.toLowerCase().includes(queryLower)) { results.push({ - name, + name: displayName || name, path: fullPath, isFile: !!node.type, }); @@ -716,7 +731,7 @@ const FileTree: React.FC = ({ setSelectedFile({ id: result.path, - name: fileName, + name: result.name || fileName, }); } else { setCurrentPath(result.path.split('/')); @@ -771,7 +786,7 @@ const FileTree: React.FC = ({ className="mr-2 h-4 w-4 flex-shrink-0" /> - {result.path.split('/').pop() || result.path} + {result.name} )) @@ -790,13 +805,47 @@ const FileTree: React.FC = ({ return []; }; + const getDisplayNameForPath = (path: string) => { + if (!directoryStructure) { + return path.split('/').pop() || path; + } + let structure: any = directoryStructure; + if (typeof structure === 'string') { + try { + structure = JSON.parse(structure); + } catch (e) { + return path.split('/').pop() || path; + } + } + if (typeof structure !== 'object' || structure === null) { + return path.split('/').pop() || path; + } + const parts = path.split('/').filter(Boolean); + let current: any = structure; + for (const part of parts) { + if (!current || typeof current !== 'object') { + return parts[parts.length - 1] || path; + } + current = current[part]; + } + if ( + current && + typeof current === 'object' && + typeof current.display_name === 'string' && + current.display_name.trim() + ) { + return current.display_name; + } + return parts[parts.length - 1] || path; + }; + const handleFileSelect = (path: string) => { const pathParts = path.split('/'); const fileName = pathParts.pop() || ''; setCurrentPath(pathParts); setSelectedFile({ id: path, - name: fileName, + name: getDisplayNameForPath(path) || fileName, }); }; @@ -810,6 +859,7 @@ const FileTree: React.FC = ({ documentName={sourceName} handleGoBack={() => setSelectedFile(null)} path={selectedFile.id} + displayPath={[...currentPath, selectedFile.name].join('/')} onFileSearch={handleFileSearch} onFileSelect={handleFileSelect} /> diff --git a/frontend/src/upload/Upload.tsx b/frontend/src/upload/Upload.tsx index 21341f16..33cadd06 100644 --- a/frontend/src/upload/Upload.tsx +++ b/frontend/src/upload/Upload.tsx @@ -263,6 +263,7 @@ function Upload({ name: '', config: {}, })); + const [nameTouched, setNameTouched] = useState(false); const { t } = useTranslation(); const dispatch = useDispatch(); @@ -284,6 +285,7 @@ function Upload({ setSelectedFiles([]); setSelectedFolders([]); setShowAdvancedOptions(false); + setNameTouched(false); }, []); const handleTaskFailure = useCallback( @@ -410,7 +412,10 @@ function Upload({ const onDrop = useCallback( (acceptedFiles: File[]) => { setfiles(acceptedFiles); - setIngestor((prev) => ({ ...prev, name: acceptedFiles[0]?.name || '' })); + const pickedName = acceptedFiles[0]?.name; + if (!nameTouched && pickedName) { + setIngestor((prev) => ({ ...prev, name: pickedName })); + } // If we're in local_file mode, update the ingestor config if (ingestor.type === 'local_file') { @@ -423,7 +428,7 @@ function Upload({ })); } }, - [ingestor.type], + [ingestor.type, nameTouched], ); const doNothing = () => undefined; @@ -772,6 +777,7 @@ function Upload({ config: {}, }); setfiles([]); + setNameTouched(false); return; } @@ -781,6 +787,7 @@ function Upload({ name: defaultConfig.name, config: defaultConfig.config, }); + setNameTouched(false); // Clear files if switching away from local_file if (type !== 'local_file') { @@ -860,6 +867,7 @@ function Upload({ colorVariant="silver" value={ingestor.name} onChange={(e) => { + setNameTouched(true); setIngestor((prevState) => ({ ...prevState, name: e.target.value,