Merge pull request #1873 from ManishMadan2882/main

Sources are the new Docs
This commit is contained in:
Alex
2025-08-13 18:24:35 +01:00
committed by GitHub
40 changed files with 3412 additions and 1119 deletions

View File

@@ -3,11 +3,11 @@ import json
import math
import os
import secrets
import shutil
import uuid
from functools import wraps
from typing import Optional, Tuple
import tempfile
import zipfile
from bson.binary import Binary, UuidRepresentation
from bson.dbref import DBRef
from bson.objectid import ObjectId
@@ -44,6 +44,7 @@ from application.utils import (
validate_function_name,
validate_required_fields,
)
from application.utils import num_tokens_from_string
from application.vectorstore.vector_creator import VectorCreator
storage = StorageCreator.get_storage()
@@ -474,7 +475,7 @@ class DeleteByIds(Resource):
@user_ns.route("/api/delete_old")
class DeleteOldIndexes(Resource):
@api.doc(
description="Deletes old indexes",
description="Deletes old indexes and associated files",
params={"source_id": "The source ID to delete"},
)
def get(self):
@@ -491,21 +492,40 @@ class DeleteOldIndexes(Resource):
)
if not doc:
return make_response(jsonify({"status": "not found"}), 404)
storage = StorageCreator.get_storage()
try:
# Delete vector index
if settings.VECTOR_STORE == "faiss":
shutil.rmtree(os.path.join(current_dir, "indexes", str(doc["_id"])))
index_path = f"indexes/{str(doc['_id'])}"
if storage.file_exists(f"{index_path}/index.faiss"):
storage.delete_file(f"{index_path}/index.faiss")
if storage.file_exists(f"{index_path}/index.pkl"):
storage.delete_file(f"{index_path}/index.pkl")
else:
vectorstore = VectorCreator.create_vectorstore(
settings.VECTOR_STORE, source_id=str(doc["_id"])
)
vectorstore.delete_index()
if "file_path" in doc and doc["file_path"]:
file_path = doc["file_path"]
if storage.is_directory(file_path):
files = storage.list_files(file_path)
for f in files:
storage.delete_file(f)
else:
storage.delete_file(file_path)
except FileNotFoundError:
pass
except Exception as err:
current_app.logger.error(
f"Error deleting old indexes: {err}", exc_info=True
f"Error deleting files and indexes: {err}", exc_info=True
)
return make_response(jsonify({"success": False}), 400)
sources_collection.delete_one({"_id": ObjectId(source_id)})
return make_response(jsonify({"success": True}), 200)
@@ -549,146 +569,276 @@ class UploadFile(Resource):
# Create safe versions for filesystem operations
safe_user = safe_filename(user)
dir_name = safe_filename(job_name)
base_path = f"{settings.UPLOAD_FOLDER}/{safe_user}/{dir_name}"
try:
storage = StorageCreator.get_storage()
base_path = f"{settings.UPLOAD_FOLDER}/{safe_user}/{dir_name}"
if len(files) > 1:
temp_files = []
for file in files:
filename = safe_filename(file.filename)
temp_path = f"{base_path}/temp/{filename}"
storage.save_file(file, temp_path)
temp_files.append(temp_path)
print(f"Saved file: {filename}")
zip_filename = f"{dir_name}.zip"
zip_path = f"{base_path}/{zip_filename}"
zip_temp_path = None
def create_zip_archive(temp_paths, dir_name, storage):
import tempfile
with tempfile.NamedTemporaryFile(
delete=False, suffix=".zip"
) as temp_zip_file:
zip_output_path = temp_zip_file.name
with tempfile.TemporaryDirectory() as stage_dir:
for path in temp_paths:
try:
file_data = storage.get_file(path)
with open(
os.path.join(stage_dir, os.path.basename(path)),
"wb",
) as f:
f.write(file_data.read())
except Exception as e:
current_app.logger.error(
f"Error processing file {path} for zipping: {e}",
exc_info=True,
)
if os.path.exists(zip_output_path):
os.remove(zip_output_path)
raise
for file in files:
original_filename = file.filename
safe_file = safe_filename(original_filename)
with tempfile.TemporaryDirectory() as temp_dir:
temp_file_path = os.path.join(temp_dir, safe_file)
file.save(temp_file_path)
if zipfile.is_zipfile(temp_file_path):
try:
shutil.make_archive(
base_name=zip_output_path.replace(".zip", ""),
format="zip",
root_dir=stage_dir,
)
with zipfile.ZipFile(temp_file_path, 'r') as zip_ref:
zip_ref.extractall(path=temp_dir)
# Walk through extracted files and upload them
for root, _, files in os.walk(temp_dir):
for extracted_file in files:
if os.path.join(root, extracted_file) == temp_file_path:
continue
rel_path = os.path.relpath(os.path.join(root, extracted_file), temp_dir)
storage_path = f"{base_path}/{rel_path}"
with open(os.path.join(root, extracted_file), 'rb') as f:
storage.save_file(f, storage_path)
except Exception as e:
current_app.logger.error(
f"Error creating zip archive: {e}", exc_info=True
)
if os.path.exists(zip_output_path):
os.remove(zip_output_path)
raise
return zip_output_path
try:
zip_temp_path = create_zip_archive(temp_files, dir_name, storage)
with open(zip_temp_path, "rb") as zip_file:
storage.save_file(zip_file, zip_path)
task = ingest.delay(
settings.UPLOAD_FOLDER,
[
".rst",
".md",
".pdf",
".txt",
".docx",
".csv",
".epub",
".html",
".mdx",
".json",
".xlsx",
".pptx",
".png",
".jpg",
".jpeg",
],
job_name,
zip_filename,
user,
dir_name,
safe_user,
)
finally:
# Clean up temporary files
for temp_path in temp_files:
try:
storage.delete_file(temp_path)
except Exception as e:
current_app.logger.error(
f"Error deleting temporary file {temp_path}: {e}",
exc_info=True,
)
# Clean up the zip file if it was created
if zip_temp_path and os.path.exists(zip_temp_path):
os.remove(zip_temp_path)
else: # Keep this else block for single file upload
# For single file
file = files[0]
filename = safe_filename(file.filename)
file_path = f"{base_path}/{filename}"
storage.save_file(file, file_path)
task = ingest.delay(
settings.UPLOAD_FOLDER,
[
".rst",
".md",
".pdf",
".txt",
".docx",
".csv",
".epub",
".html",
".mdx",
".json",
".xlsx",
".pptx",
".png",
".jpg",
".jpeg",
],
job_name,
filename, # Corrected variable for single-file case
user,
dir_name,
safe_user,
)
current_app.logger.error(f"Error extracting zip: {e}", exc_info=True)
# If zip extraction fails, save the original zip file
file_path = f"{base_path}/{safe_file}"
with open(temp_file_path, 'rb') as f:
storage.save_file(f, file_path)
else:
# For non-zip files, save directly
file_path = f"{base_path}/{safe_file}"
with open(temp_file_path, 'rb') as f:
storage.save_file(f, file_path)
task = ingest.delay(
settings.UPLOAD_FOLDER,
[
".rst", ".md", ".pdf", ".txt", ".docx", ".csv", ".epub",
".html", ".mdx", ".json", ".xlsx", ".pptx", ".png",
".jpg", ".jpeg",
],
job_name,
user,
file_path=base_path,
filename=dir_name
)
except Exception as err:
current_app.logger.error(f"Error uploading file: {err}", exc_info=True)
return make_response(jsonify({"success": False}), 400)
return make_response(jsonify({"success": True, "task_id": task.id}), 200)
@user_ns.route("/api/manage_source_files")
class ManageSourceFiles(Resource):
@api.expect(
api.model(
"ManageSourceFilesModel",
{
"source_id": fields.String(required=True, description="Source ID to modify"),
"operation": fields.String(required=True, description="Operation: 'add', 'remove', or 'remove_directory'"),
"file_paths": fields.List(fields.String, required=False, description="File paths to remove (for remove operation)"),
"directory_path": fields.String(required=False, description="Directory path to remove (for remove_directory operation)"),
"file": fields.Raw(required=False, description="Files to add (for add operation)"),
"parent_dir": fields.String(required=False, description="Parent directory path relative to source root"),
},
)
)
@api.doc(
description="Add files, remove files, or remove directories from an existing source",
)
def post(self):
decoded_token = request.decoded_token
if not decoded_token:
return make_response(jsonify({"success": False, "message": "Unauthorized"}), 401)
user = decoded_token.get("sub")
source_id = request.form.get("source_id")
operation = request.form.get("operation")
if not source_id or not operation:
return make_response(
jsonify({"success": False, "message": "source_id and operation are required"}), 400
)
if operation not in ["add", "remove", "remove_directory"]:
return make_response(
jsonify({"success": False, "message": "operation must be 'add', 'remove', or 'remove_directory'"}), 400
)
try:
ObjectId(source_id)
except Exception:
return make_response(
jsonify({"success": False, "message": "Invalid source ID format"}), 400
)
try:
source = sources_collection.find_one({"_id": ObjectId(source_id), "user": user})
if not source:
return make_response(
jsonify({"success": False, "message": "Source not found or access denied"}), 404
)
except Exception as err:
current_app.logger.error(f"Error finding source: {err}", exc_info=True)
return make_response(jsonify({"success": False, "message": "Database error"}), 500)
try:
storage = StorageCreator.get_storage()
source_file_path = source.get("file_path", "")
parent_dir = request.form.get("parent_dir", "")
if parent_dir and (parent_dir.startswith("/") or ".." in parent_dir):
return make_response(
jsonify({"success": False, "message": "Invalid parent directory path"}), 400
)
if operation == "add":
files = request.files.getlist("file")
if not files or all(file.filename == "" for file in files):
return make_response(
jsonify({"success": False, "message": "No files provided for add operation"}), 400
)
added_files = []
target_dir = source_file_path
if parent_dir:
target_dir = f"{source_file_path}/{parent_dir}"
for file in files:
if file.filename:
safe_filename_str = safe_filename(file.filename)
file_path = f"{target_dir}/{safe_filename_str}"
# Save file to storage
storage.save_file(file, file_path)
added_files.append(safe_filename_str)
# Trigger re-ingestion pipeline
from application.api.user.tasks import reingest_source_task
task = reingest_source_task.delay(source_id=source_id, user=user)
return make_response(jsonify({
"success": True,
"message": f"Added {len(added_files)} files",
"added_files": added_files,
"parent_dir": parent_dir,
"reingest_task_id": task.id
}), 200)
elif operation == "remove":
file_paths_str = request.form.get("file_paths")
if not file_paths_str:
return make_response(
jsonify({"success": False, "message": "file_paths required for remove operation"}), 400
)
try:
file_paths = json.loads(file_paths_str) if isinstance(file_paths_str, str) else file_paths_str
except Exception:
return make_response(
jsonify({"success": False, "message": "Invalid file_paths format"}), 400
)
# Remove files from storage and directory structure
removed_files = []
for file_path in file_paths:
full_path = f"{source_file_path}/{file_path}"
# Remove from storage
if storage.file_exists(full_path):
storage.delete_file(full_path)
removed_files.append(file_path)
# Trigger re-ingestion pipeline
from application.api.user.tasks import reingest_source_task
task = reingest_source_task.delay(source_id=source_id, user=user)
return make_response(jsonify({
"success": True,
"message": f"Removed {len(removed_files)} files",
"removed_files": removed_files,
"reingest_task_id": task.id
}), 200)
elif operation == "remove_directory":
directory_path = request.form.get("directory_path")
if not directory_path:
return make_response(
jsonify({"success": False, "message": "directory_path required for remove_directory operation"}), 400
)
# Validate directory path (prevent path traversal)
if directory_path.startswith("/") or ".." in directory_path:
current_app.logger.warning(
f"Invalid directory path attempted for removal. "
f"User: {user}, Source ID: {source_id}, Directory path: {directory_path}"
)
return make_response(
jsonify({"success": False, "message": "Invalid directory path"}), 400
)
full_directory_path = f"{source_file_path}/{directory_path}" if directory_path else source_file_path
if not storage.is_directory(full_directory_path):
current_app.logger.warning(
f"Directory not found or is not a directory for removal. "
f"User: {user}, Source ID: {source_id}, Directory path: {directory_path}, "
f"Full path: {full_directory_path}"
)
return make_response(
jsonify({"success": False, "message": "Directory not found or is not a directory"}), 404
)
success = storage.remove_directory(full_directory_path)
if not success:
current_app.logger.error(
f"Failed to remove directory from storage. "
f"User: {user}, Source ID: {source_id}, Directory path: {directory_path}, "
f"Full path: {full_directory_path}"
)
return make_response(
jsonify({"success": False, "message": "Failed to remove directory"}), 500
)
current_app.logger.info(
f"Successfully removed directory. "
f"User: {user}, Source ID: {source_id}, Directory path: {directory_path}, "
f"Full path: {full_directory_path}"
)
# Trigger re-ingestion pipeline
from application.api.user.tasks import reingest_source_task
task = reingest_source_task.delay(source_id=source_id, user=user)
return make_response(jsonify({
"success": True,
"message": f"Successfully removed directory: {directory_path}",
"removed_directory": directory_path,
"reingest_task_id": task.id
}), 200)
except Exception as err:
error_context = f"operation={operation}, user={user}, source_id={source_id}"
if operation == "remove_directory":
directory_path = request.form.get("directory_path", "")
error_context += f", directory_path={directory_path}"
elif operation == "remove":
file_paths_str = request.form.get("file_paths", "")
error_context += f", file_paths={file_paths_str}"
elif operation == "add":
parent_dir = request.form.get("parent_dir", "")
error_context += f", parent_dir={parent_dir}"
current_app.logger.error(f"Error managing source files: {err} ({error_context})", exc_info=True)
return make_response(jsonify({"success": False, "message": "Operation failed"}), 500)
@user_ns.route("/api/remote")
class UploadRemote(Resource):
@api.expect(
@@ -834,6 +984,7 @@ class PaginatedSources(Resource):
"tokens": doc.get("tokens", ""),
"retriever": doc.get("retriever", "classic"),
"syncFrequency": doc.get("sync_frequency", ""),
"isNested": bool(doc.get("directory_structure"))
}
paginated_docs.append(doc_data)
response = {
@@ -881,6 +1032,7 @@ class CombinedJson(Resource):
"tokens": index.get("tokens", ""),
"retriever": index.get("retriever", "classic"),
"syncFrequency": index.get("sync_frequency", ""),
"is_nested": bool(index.get("directory_structure"))
}
)
except Exception as err:
@@ -3374,8 +3526,14 @@ class DeleteTool(Resource):
@user_ns.route("/api/get_chunks")
class GetChunks(Resource):
@api.doc(
description="Retrieves all chunks associated with a document",
params={"id": "The document ID"},
description="Retrieves chunks from a document, optionally filtered by file path and search term",
params={
"id": "The document ID",
"page": "Page number for pagination",
"per_page": "Number of chunks per page",
"path": "Optional: Filter chunks by relative file path",
"search": "Optional: Search term to filter chunks by title or content"
},
)
def get(self):
decoded_token = request.decoded_token
@@ -3385,6 +3543,8 @@ class GetChunks(Resource):
doc_id = request.args.get("id")
page = int(request.args.get("page", 1))
per_page = int(request.args.get("per_page", 10))
path = request.args.get("path")
search_term = request.args.get("search", "").strip().lower()
if not ObjectId.is_valid(doc_id):
return make_response(jsonify({"error": "Invalid doc_id"}), 400)
@@ -3396,6 +3556,30 @@ class GetChunks(Resource):
try:
store = get_vector_store(doc_id)
chunks = store.get_chunks()
filtered_chunks = []
for chunk in chunks:
metadata = chunk.get("metadata", {})
# Filter by path if provided
if path:
chunk_source = metadata.get("source", "")
# Check if the chunk's source matches the requested path
if not chunk_source or not chunk_source.endswith(path):
continue
# Filter by search term if provided
if search_term:
text_match = search_term in chunk.get("text", "").lower()
title_match = search_term in metadata.get("title", "").lower()
if not (text_match or title_match):
continue
filtered_chunks.append(chunk)
chunks = filtered_chunks
total_chunks = len(chunks)
start = (page - 1) * per_page
end = start + per_page
@@ -3408,6 +3592,8 @@ class GetChunks(Resource):
"per_page": per_page,
"total": total_chunks,
"chunks": paginated_chunks,
"path": path if path else None,
"search": search_term if search_term else None
}
),
200,
@@ -3416,7 +3602,6 @@ class GetChunks(Resource):
current_app.logger.error(f"Error getting chunks: {e}", exc_info=True)
return make_response(jsonify({"success": False}), 500)
@user_ns.route("/api/add_chunk")
class AddChunk(Resource):
@api.expect(
@@ -3448,6 +3633,8 @@ class AddChunk(Resource):
doc_id = data.get("id")
text = data.get("text")
metadata = data.get("metadata", {})
token_count = num_tokens_from_string(text)
metadata["token_count"] = token_count
if not ObjectId.is_valid(doc_id):
return make_response(jsonify({"error": "Invalid doc_id"}), 400)
@@ -3544,6 +3731,12 @@ class UpdateChunk(Resource):
text = data.get("text")
metadata = data.get("metadata")
if text is not None:
token_count = num_tokens_from_string(text)
if metadata is None:
metadata = {}
metadata["token_count"] = token_count
if not ObjectId.is_valid(doc_id):
return make_response(jsonify({"error": "Invalid doc_id"}), 400)
doc = sources_collection.find_one({"_id": ObjectId(doc_id), "user": user})
@@ -3553,31 +3746,45 @@ class UpdateChunk(Resource):
)
try:
store = get_vector_store(doc_id)
chunks = store.get_chunks()
existing_chunk = next((c for c in chunks if c["doc_id"] == chunk_id), None)
if not existing_chunk:
return make_response(jsonify({"error": "Chunk not found"}), 404)
deleted = store.delete_chunk(chunk_id)
if not deleted:
return make_response(
jsonify({"error": "Failed to delete existing chunk"}), 500
)
new_text = text if text is not None else existing_chunk["text"]
new_metadata = (
metadata if metadata is not None else existing_chunk["metadata"]
)
new_chunk_id = store.add_chunk(new_text, new_metadata)
if metadata is not None:
new_metadata = existing_chunk["metadata"].copy()
new_metadata.update(metadata)
else:
new_metadata = existing_chunk["metadata"].copy()
return make_response(
jsonify(
{
"message": "Chunk updated successfully",
"new_chunk_id": new_chunk_id,
}
),
200,
)
if text is not None:
new_metadata["token_count"] = num_tokens_from_string(new_text)
try:
new_chunk_id = store.add_chunk(new_text, new_metadata)
deleted = store.delete_chunk(chunk_id)
if not deleted:
current_app.logger.warning(f"Failed to delete old chunk {chunk_id}, but new chunk {new_chunk_id} was created")
return make_response(
jsonify(
{
"message": "Chunk updated successfully",
"chunk_id": new_chunk_id,
"original_chunk_id": chunk_id,
}
),
200,
)
except Exception as add_error:
current_app.logger.error(f"Failed to add updated chunk: {add_error}")
return make_response(
jsonify({"error": "Failed to update chunk - addition failed"}), 500
)
except Exception as e:
current_app.logger.error(f"Error updating chunk: {e}", exc_info=True)
return make_response(jsonify({"success": False}), 500)
@@ -3681,3 +3888,51 @@ class ServeImage(Resource):
return make_response(
jsonify({"success": False, "message": "Error retrieving image"}), 500
)
@user_ns.route("/api/directory_structure")
class DirectoryStructure(Resource):
@api.doc(
description="Get the directory structure for a document",
params={"id": "The document ID"},
)
def get(self):
decoded_token = request.decoded_token
if not decoded_token:
return make_response(jsonify({"success": False}), 401)
user = decoded_token.get("sub")
doc_id = request.args.get("id")
if not doc_id:
return make_response(
jsonify({"error": "Document ID is required"}), 400
)
if not ObjectId.is_valid(doc_id):
return make_response(jsonify({"error": "Invalid document ID"}), 400)
try:
doc = sources_collection.find_one({"_id": ObjectId(doc_id), "user": user})
if not doc:
return make_response(
jsonify({"error": "Document not found or access denied"}), 404
)
directory_structure = doc.get("directory_structure", {})
return make_response(
jsonify({
"success": True,
"directory_structure": directory_structure,
"base_path": doc.get("file_path", "")
}), 200
)
except Exception as e:
current_app.logger.error(
f"Error retrieving directory structure: {e}", exc_info=True
)
return make_response(
jsonify({"success": False, "error": str(e)}), 500
)