import logging import os import shutil import string import zipfile from collections import Counter from urllib.parse import urljoin import requests from bson.objectid import ObjectId from pymongo import MongoClient from application.core.settings import settings from application.parser.file.bulk import SimpleDirectoryReader from application.parser.open_ai_func import call_openai_api from application.parser.remote.remote_creator import RemoteCreator from application.parser.schema.base import Document from application.parser.token_func import group_split from application.utils import count_tokens_docs mongo = MongoClient(settings.MONGO_URI) db = mongo["docsgpt"] sources_collection = db["sources"] # Define a function to extract metadata from a given filename. def metadata_from_filename(title): return {"title": title} # Define a function to generate a random string of a given length. def generate_random_string(length): return "".join([string.ascii_letters[i % 52] for i in range(length)]) current_dir = os.path.dirname( os.path.dirname(os.path.dirname(os.path.abspath(__file__))) ) def extract_zip_recursive(zip_path, extract_to, current_depth=0, max_depth=5): """ Recursively extract zip files with a limit on recursion depth. Args: zip_path (str): Path to the zip file to be extracted. extract_to (str): Destination path for extracted files. current_depth (int): Current depth of recursion. max_depth (int): Maximum allowed depth of recursion to prevent infinite loops. """ if current_depth > max_depth: logging.warning(f"Reached maximum recursion depth of {max_depth}") return with zipfile.ZipFile(zip_path, "r") as zip_ref: zip_ref.extractall(extract_to) os.remove(zip_path) # Remove the zip file after extracting # Check for nested zip files and extract them for root, dirs, files in os.walk(extract_to): for file in files: if file.endswith(".zip"): # If a nested zip file is found, extract it recursively file_path = os.path.join(root, file) extract_zip_recursive(file_path, root, current_depth + 1, max_depth) # Define the main function for ingesting and processing documents. def ingest_worker( self, directory, formats, name_job, filename, user, retriever="classic" ): """ Ingest and process documents. Args: self: Reference to the instance of the task. directory (str): Specifies the directory for ingesting ('inputs' or 'temp'). formats (list of str): List of file extensions to consider for ingestion (e.g., [".rst", ".md"]). name_job (str): Name of the job for this ingestion task. filename (str): Name of the file to be ingested. user (str): Identifier for the user initiating the ingestion. retriever (str): Type of retriever to use for processing the documents. Returns: dict: Information about the completed ingestion task, including input parameters and a "limited" flag. """ # directory = 'inputs' or 'temp' # formats = [".rst", ".md"] input_files = None recursive = True limit = None exclude = True # name_job = 'job1' # filename = 'install.rst' # user = 'local' sample = False token_check = True min_tokens = 150 max_tokens = 1250 recursion_depth = 2 full_path = os.path.join(directory, user, name_job) logging.info(f"Ingest file: {full_path}", extra={"user": user, "job": name_job}) # check if API_URL env variable is set file_data = {"name": name_job, "file": filename, "user": user} response = requests.get( urljoin(settings.API_URL, "/api/download"), params=file_data ) file = response.content if not os.path.exists(full_path): os.makedirs(full_path) with open(os.path.join(full_path, filename), "wb") as f: f.write(file) # check if file is .zip and extract it if filename.endswith(".zip"): extract_zip_recursive( os.path.join(full_path, filename), full_path, 0, recursion_depth ) self.update_state(state="PROGRESS", meta={"current": 1}) raw_docs = SimpleDirectoryReader( input_dir=full_path, input_files=input_files, recursive=recursive, required_exts=formats, num_files_limit=limit, exclude_hidden=exclude, file_metadata=metadata_from_filename, ).load_data() raw_docs = group_split( documents=raw_docs, min_tokens=min_tokens, max_tokens=max_tokens, token_check=token_check, ) docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs] id = ObjectId() call_openai_api(docs, full_path, id, self) tokens = count_tokens_docs(docs) self.update_state(state="PROGRESS", meta={"current": 100}) if sample: for i in range(min(5, len(raw_docs))): logging.info(f"Sample document {i}: {raw_docs[i]}") # get files from outputs/inputs/index.faiss and outputs/inputs/index.pkl # and send them to the server (provide user and name in form) file_data = { "name": name_job, "user": user, "tokens": tokens, "retriever": retriever, "id": str(id), "type": "local", } if settings.VECTOR_STORE == "faiss": files = { "file_faiss": open(full_path + "/index.faiss", "rb"), "file_pkl": open(full_path + "/index.pkl", "rb"), } response = requests.post( urljoin(settings.API_URL, "/api/upload_index"), files=files, data=file_data ) else: response = requests.post( urljoin(settings.API_URL, "/api/upload_index"), data=file_data ) # delete local shutil.rmtree(full_path) return { "directory": directory, "formats": formats, "name_job": name_job, "filename": filename, "user": user, "limited": False, } def remote_worker( self, source_data, name_job, user, loader, directory="temp", retriever="classic", sync_frequency="never", operation_mode="upload", doc_id=None, ): token_check = True min_tokens = 150 max_tokens = 1250 full_path = directory + "/" + user + "/" + name_job if not os.path.exists(full_path): os.makedirs(full_path) self.update_state(state="PROGRESS", meta={"current": 1}) logging.info( f"Remote job: {full_path}", extra={"user": user, "job": name_job, source_data: source_data}, ) remote_loader = RemoteCreator.create_loader(loader) raw_docs = remote_loader.load_data(source_data) docs = group_split( documents=raw_docs, min_tokens=min_tokens, max_tokens=max_tokens, token_check=token_check, ) # docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs] tokens = count_tokens_docs(docs) if operation_mode == "upload": id = ObjectId() call_openai_api(docs, full_path, id, self) elif operation_mode == "sync": if not doc_id or not ObjectId.is_valid(doc_id): raise ValueError("doc_id must be provided for sync operation.") id = ObjectId(doc_id) call_openai_api(docs, full_path, id, self) self.update_state(state="PROGRESS", meta={"current": 100}) # Proceed with uploading and cleaning as in the original function file_data = { "name": name_job, "user": user, "tokens": tokens, "retriever": retriever, "id": str(id), "type": loader, "remote_data": source_data, "sync_frequency": sync_frequency, } if settings.VECTOR_STORE == "faiss": files = { "file_faiss": open(full_path + "/index.faiss", "rb"), "file_pkl": open(full_path + "/index.pkl", "rb"), } requests.post( urljoin(settings.API_URL, "/api/upload_index"), files=files, data=file_data ) else: requests.post(urljoin(settings.API_URL, "/api/upload_index"), data=file_data) shutil.rmtree(full_path) return {"urls": source_data, "name_job": name_job, "user": user, "limited": False} def sync( self, source_data, name_job, user, loader, sync_frequency, retriever, doc_id=None, directory="temp", ): try: remote_worker( self, source_data, name_job, user, loader, directory, retriever, sync_frequency, "sync", doc_id, ) except Exception as e: return {"status": "error", "error": str(e)} return {"status": "success"} def sync_worker(self, frequency): sync_counts = Counter() sources = sources_collection.find() for doc in sources: if doc.get("sync_frequency") == frequency: name = doc.get("name") user = doc.get("user") source_type = doc.get("type") source_data = doc.get("remote_data") retriever = doc.get("retriever") doc_id = str(doc.get("_id")) resp = sync( self, source_data, name, user, source_type, frequency, retriever, doc_id ) sync_counts["total_sync_count"] += 1 sync_counts[ "sync_success" if resp["status"] == "success" else "sync_failure" ] += 1 return { key: sync_counts[key] for key in ["total_sync_count", "sync_success", "sync_failure"] }