feat: sync remote sources through celery periodic tasks

This commit is contained in:
Siddhant Rai
2024-09-25 15:20:11 +05:30
parent f92658de82
commit 3d292aa485
13 changed files with 404 additions and 88 deletions

View File

@@ -6,15 +6,20 @@ from werkzeug.utils import secure_filename
from bson.objectid import ObjectId
from application.core.settings import settings
mongo = MongoClient(settings.MONGO_URI)
db = mongo["docsgpt"]
conversations_collection = db["conversations"]
sources_collection = db["sources"]
current_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
current_dir = os.path.dirname(
os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
)
internal = Blueprint("internal", __name__)
internal = Blueprint('internal', __name__)
@internal.route("/api/download", methods=["get"])
def download_file():
user = secure_filename(request.args.get("user"))
@@ -24,7 +29,6 @@ def download_file():
return send_from_directory(save_dir, filename, as_attachment=True)
@internal.route("/api/upload_index", methods=["POST"])
def upload_index_files():
"""Upload two files(index.faiss, index.pkl) to the user's folder."""
@@ -38,7 +42,8 @@ def upload_index_files():
retriever = secure_filename(request.form["retriever"])
id = secure_filename(request.form["id"])
type = secure_filename(request.form["type"])
remote_data = secure_filename(request.form["remote_data"]) if "remote_data" in request.form else None
remote_data = request.form["remote_data"] if "remote_data" in request.form else None
sync_frequency = secure_filename(request.form["sync_frequency"])
save_dir = os.path.join(current_dir, "indexes", str(id))
if settings.VECTOR_STORE == "faiss":
@@ -55,24 +60,45 @@ def upload_index_files():
if file_pkl.filename == "":
return {"status": "no file name"}
# saves index files
if not os.path.exists(save_dir):
os.makedirs(save_dir)
file_faiss.save(os.path.join(save_dir, "index.faiss"))
file_pkl.save(os.path.join(save_dir, "index.pkl"))
# create entry in sources_collection
sources_collection.insert_one(
{
"_id": ObjectId(id),
"user": user,
"name": job_name,
"language": job_name,
"date": datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S"),
"model": settings.EMBEDDINGS_NAME,
"type": type,
"tokens": tokens,
"retriever": retriever,
"remote_data": remote_data
}
)
return {"status": "ok"}
existing_entry = sources_collection.find_one({"_id": ObjectId(id)})
if existing_entry:
sources_collection.update_one(
{"_id": ObjectId(id)},
{
"$set": {
"user": user,
"name": job_name,
"language": job_name,
"date": datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S"),
"model": settings.EMBEDDINGS_NAME,
"type": type,
"tokens": tokens,
"retriever": retriever,
"remote_data": remote_data,
"sync_frequency": sync_frequency,
}
},
)
else:
sources_collection.insert_one(
{
"_id": ObjectId(id),
"user": user,
"name": job_name,
"language": job_name,
"date": datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S"),
"model": settings.EMBEDDINGS_NAME,
"type": type,
"tokens": tokens,
"retriever": retriever,
"remote_data": remote_data,
"sync_frequency": sync_frequency,
}
)
return {"status": "ok"}

View File

@@ -289,14 +289,13 @@ def combined_json():
data.append(
{
"id": str(index["_id"]),
"name": index["name"],
"date": index["date"],
"name": index.get("name"),
"date": index.get("date"),
"model": settings.EMBEDDINGS_NAME,
"location": "local",
"tokens": index["tokens"] if ("tokens" in index.keys()) else "",
"retriever": (
index["retriever"] if ("retriever" in index.keys()) else "classic"
),
"tokens": index.get("tokens", ""),
"retriever": index.get("retriever", "classic"),
"syncFrequency": index.get("sync_frequency", ""),
}
)
if "duckduck_search" in settings.RETRIEVERS_ENABLED:
@@ -1157,3 +1156,27 @@ def get_user_logs():
),
200,
)
@user.route("/api/manage_sync", methods=["POST"])
def manage_sync():
data = request.get_json()
source_id = data.get("source_id")
sync_frequency = data.get("sync_frequency")
if sync_frequency not in ["never", "daily", "weekly", "monthly"]:
return jsonify({"status": "invalid frequency"}), 400
update_data = {"$set": {"sync_frequency": sync_frequency}}
try:
sources_collection.update_one(
{
"_id": ObjectId(source_id),
"user": "local",
},
update_data,
)
except Exception as err:
print(err)
return jsonify({"status": "error"}), 400
return jsonify({"status": "ok"}), 200

View File

@@ -1,12 +1,38 @@
from application.worker import ingest_worker, remote_worker
from datetime import timedelta
from application.celery_init import celery
from application.worker import ingest_worker, remote_worker, sync_worker
@celery.task(bind=True)
def ingest(self, directory, formats, name_job, filename, user):
resp = ingest_worker(self, directory, formats, name_job, filename, user)
return resp
@celery.task(bind=True)
def ingest_remote(self, source_data, job_name, user, loader):
resp = remote_worker(self, source_data, job_name, user, loader)
return resp
@celery.task(bind=True)
def schedule_syncs(self, frequency):
resp = sync_worker(self, frequency)
return resp
@celery.on_after_configure.connect
def setup_periodic_tasks(sender, **kwargs):
sender.add_periodic_task(
timedelta(days=1),
schedule_syncs.s("daily"),
)
sender.add_periodic_task(
timedelta(weeks=1),
schedule_syncs.s("weekly"),
)
sender.add_periodic_task(
timedelta(days=30),
schedule_syncs.s("monthly"),
)