Merge pull request #917 from arc53/multiple-uploads

Multiple file upload
This commit is contained in:
Alex
2024-04-09 18:13:52 +01:00
committed by GitHub
3 changed files with 67 additions and 27 deletions

View File

@@ -1,5 +1,6 @@
import os import os
import uuid import uuid
import shutil
from flask import Blueprint, request, jsonify from flask import Blueprint, request, jsonify
from urllib.parse import urlparse from urllib.parse import urlparse
import requests import requests
@@ -136,30 +137,43 @@ def upload_file():
return {"status": "no name"} return {"status": "no name"}
job_name = secure_filename(request.form["name"]) job_name = secure_filename(request.form["name"])
# check if the post request has the file part # check if the post request has the file part
if "file" not in request.files: files = request.files.getlist("file")
print("No file part")
return {"status": "no file"} if not files or all(file.filename == '' for file in files):
file = request.files["file"]
if file.filename == "":
return {"status": "no file name"} return {"status": "no file name"}
if file: # Directory where files will be saved
filename = secure_filename(file.filename) save_dir = os.path.join(current_dir, settings.UPLOAD_FOLDER, user, job_name)
# save dir os.makedirs(save_dir, exist_ok=True)
save_dir = os.path.join(current_dir, settings.UPLOAD_FOLDER, user, job_name)
# create dir if not exists if len(files) > 1:
if not os.path.exists(save_dir): # Multiple files; prepare them for zip
os.makedirs(save_dir) temp_dir = os.path.join(save_dir, "temp")
os.makedirs(temp_dir, exist_ok=True)
file.save(os.path.join(save_dir, filename))
task = ingest.delay(settings.UPLOAD_FOLDER, [".rst", ".md", ".pdf", ".txt", ".docx", for file in files:
".csv", ".epub", ".html", ".mdx"], filename = secure_filename(file.filename)
job_name, filename, user) file.save(os.path.join(temp_dir, filename))
# task id
task_id = task.id # Use shutil.make_archive to zip the temp directory
return {"status": "ok", "task_id": task_id} zip_path = shutil.make_archive(base_name=os.path.join(save_dir, job_name), format='zip', root_dir=temp_dir)
final_filename = os.path.basename(zip_path)
# Clean up the temporary directory after zipping
shutil.rmtree(temp_dir)
else: else:
return {"status": "error"} # Single file
file = files[0]
final_filename = secure_filename(file.filename)
file_path = os.path.join(save_dir, final_filename)
file.save(file_path)
# Call ingest with the single file or zipped file
task = ingest.delay(settings.UPLOAD_FOLDER, [".rst", ".md", ".pdf", ".txt", ".docx",
".csv", ".epub", ".html", ".mdx"],
job_name, final_filename, user)
return {"status": "ok", "task_id": task.id}
@user.route("/api/remote", methods=["POST"]) @user.route("/api/remote", methods=["POST"])
def upload_remote(): def upload_remote():

View File

@@ -36,6 +36,32 @@ current_dir = os.path.dirname(
os.path.dirname(os.path.dirname(os.path.abspath(__file__))) os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
) )
def extract_zip_recursive(zip_path, extract_to, current_depth=0, max_depth=5):
    """
    Recursively extract zip files with a limit on recursion depth.

    The archive at ``zip_path`` is unpacked into ``extract_to`` and then
    deleted. Every ``.zip`` file that came out of it is unpacked in turn into
    the directory it was extracted to, one nesting level deeper.

    Only archives produced by this extraction are followed. Re-walking the
    whole destination tree (as a naive ``os.walk`` would) lets an inner call
    extract and delete sibling archives that the outer loop still intends to
    open, which raises ``FileNotFoundError`` and mis-counts nesting depth.

    Args:
        zip_path (str): Path to the zip file to be extracted.
        extract_to (str): Destination path for extracted files.
        current_depth (int): Current depth of recursion.
        max_depth (int): Maximum allowed depth of recursion to prevent infinite loops.
            An archive whose depth exceeds this value is left on disk untouched.

    Raises:
        zipfile.BadZipFile: If ``zip_path`` (or a nested ``.zip``) is not a valid archive.
        OSError: If the archive cannot be read, written out, or removed.
    """
    if current_depth > max_depth:
        print(f"Reached maximum recursion depth of {max_depth}")
        return

    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        # ZipFile.extract returns the sanitised on-disk path of each member,
        # so we know exactly which files this archive produced.
        extracted_paths = [
            zip_ref.extract(member, extract_to) for member in zip_ref.infolist()
        ]
    os.remove(zip_path)  # Remove the zip file after extracting

    for extracted in extracted_paths:
        # isfile() skips directory entries and members that a previous
        # nested extraction has already consumed (e.g. duplicate names).
        if extracted.endswith(".zip") and os.path.isfile(extracted):
            extract_zip_recursive(
                extracted, os.path.dirname(extracted), current_depth + 1, max_depth
            )
# Define the main function for ingesting and processing documents. # Define the main function for ingesting and processing documents.
def ingest_worker(self, directory, formats, name_job, filename, user): def ingest_worker(self, directory, formats, name_job, filename, user):
@@ -66,9 +92,11 @@ def ingest_worker(self, directory, formats, name_job, filename, user):
token_check = True token_check = True
min_tokens = 150 min_tokens = 150
max_tokens = 1250 max_tokens = 1250
full_path = directory + "/" + user + "/" + name_job recursion_depth = 2
full_path = os.path.join(directory, user, name_job)
import sys import sys
print(full_path, file=sys.stderr) print(full_path, file=sys.stderr)
# check if API_URL env variable is set # check if API_URL env variable is set
file_data = {"name": name_job, "file": filename, "user": user} file_data = {"name": name_job, "file": filename, "user": user}
@@ -81,14 +109,12 @@ def ingest_worker(self, directory, formats, name_job, filename, user):
if not os.path.exists(full_path): if not os.path.exists(full_path):
os.makedirs(full_path) os.makedirs(full_path)
with open(full_path + "/" + filename, "wb") as f: with open(os.path.join(full_path, filename), "wb") as f:
f.write(file) f.write(file)
# check if file is .zip and extract it # check if file is .zip and extract it
if filename.endswith(".zip"): if filename.endswith(".zip"):
with zipfile.ZipFile(full_path + "/" + filename, "r") as zip_ref: extract_zip_recursive(os.path.join(full_path, filename), full_path, 0, recursion_depth)
zip_ref.extractall(full_path)
os.remove(full_path + "/" + filename)
self.update_state(state="PROGRESS", meta={"current": 1}) self.update_state(state="PROGRESS", meta={"current": 1})

View File

@@ -201,7 +201,7 @@ export default function Upload({
const { getRootProps, getInputProps, isDragActive } = useDropzone({ const { getRootProps, getInputProps, isDragActive } = useDropzone({
onDrop, onDrop,
multiple: false, multiple: true,
onDragEnter: doNothing, onDragEnter: doNothing,
onDragOver: doNothing, onDragOver: doNothing,
onDragLeave: doNothing, onDragLeave: doNothing,