mirror of
https://github.com/arc53/DocsGPT.git
synced 2025-11-29 16:43:16 +00:00
feat: new vectors structure
This commit is contained in:
@@ -35,12 +35,12 @@ def upload_index_files():
|
||||
return {"status": "no name"}
|
||||
job_name = secure_filename(request.form["name"])
|
||||
tokens = secure_filename(request.form["tokens"])
|
||||
""""
|
||||
ObjectId serves as a dir name in application/indexes,
|
||||
and for indexing the vector metadata in the collection
|
||||
"""
|
||||
_id = ObjectId()
|
||||
save_dir = os.path.join(current_dir, "indexes", str(_id))
|
||||
retriever = secure_filename(request.form["retriever"])
|
||||
id = secure_filename(request.form["id"])
|
||||
type = secure_filename(request.form["type"])
|
||||
remote_data = secure_filename(request.form["remote_data"]) if "remote_data" in request.form else None
|
||||
|
||||
save_dir = os.path.join(current_dir, "indexes", str(id))
|
||||
if settings.VECTOR_STORE == "faiss":
|
||||
if "file_faiss" not in request.files:
|
||||
print("No file part")
|
||||
@@ -63,15 +63,16 @@ def upload_index_files():
|
||||
# create entry in vectors_collection
|
||||
vectors_collection.insert_one(
|
||||
{
|
||||
"_id":_id,
|
||||
"_id": ObjectId(id),
|
||||
"user": user,
|
||||
"name": job_name,
|
||||
"language": job_name,
|
||||
"location": save_dir,
|
||||
"date": datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S"),
|
||||
"model": settings.EMBEDDINGS_NAME,
|
||||
"type": "local",
|
||||
"tokens": tokens
|
||||
"type": type,
|
||||
"tokens": tokens,
|
||||
"retriever": retriever,
|
||||
"remote_data": remote_data
|
||||
}
|
||||
)
|
||||
return {"status": "ok"}
|
||||
@@ -237,15 +237,11 @@ def combined_json():
|
||||
data = [
|
||||
{
|
||||
"name": "default",
|
||||
"language": "default",
|
||||
"version": "",
|
||||
"description": "default",
|
||||
"fullName": "default",
|
||||
"date": "default",
|
||||
"docLink": "default",
|
||||
"model": settings.EMBEDDINGS_NAME,
|
||||
"location": "remote",
|
||||
"tokens": "",
|
||||
"retriever": "classic",
|
||||
}
|
||||
]
|
||||
# structure: name, language, version, description, fullName, date, docLink
|
||||
@@ -255,35 +251,22 @@ def combined_json():
|
||||
{
|
||||
"id": str(index["_id"]),
|
||||
"name": index["name"],
|
||||
"language": index["language"],
|
||||
"version": "",
|
||||
"description": index["name"],
|
||||
"fullName": index["name"],
|
||||
"date": index["date"],
|
||||
"docLink": index["location"],
|
||||
"model": settings.EMBEDDINGS_NAME,
|
||||
"location": "local",
|
||||
"tokens": index["tokens"] if ("tokens" in index.keys()) else "",
|
||||
"retriever": index["retriever"] if ("retriever" in index.keys()) else "classic",
|
||||
}
|
||||
)
|
||||
if settings.VECTOR_STORE == "faiss":
|
||||
data_remote = requests.get("https://d3dg1063dc54p9.cloudfront.net/combined.json").json()
|
||||
for index in data_remote:
|
||||
index["location"] = "remote"
|
||||
data.append(index)
|
||||
if "duckduck_search" in settings.RETRIEVERS_ENABLED:
|
||||
data.append(
|
||||
{
|
||||
"name": "DuckDuckGo Search",
|
||||
"language": "en",
|
||||
"version": "",
|
||||
"description": "duckduck_search",
|
||||
"fullName": "DuckDuckGo Search",
|
||||
"date": "duckduck_search",
|
||||
"docLink": "duckduck_search",
|
||||
"model": settings.EMBEDDINGS_NAME,
|
||||
"location": "custom",
|
||||
"tokens": "",
|
||||
"retriever": "duckduck_search",
|
||||
}
|
||||
)
|
||||
if "brave_search" in settings.RETRIEVERS_ENABLED:
|
||||
@@ -291,14 +274,11 @@ def combined_json():
|
||||
{
|
||||
"name": "Brave Search",
|
||||
"language": "en",
|
||||
"version": "",
|
||||
"description": "brave_search",
|
||||
"fullName": "Brave Search",
|
||||
"date": "brave_search",
|
||||
"docLink": "brave_search",
|
||||
"model": settings.EMBEDDINGS_NAME,
|
||||
"location": "custom",
|
||||
"tokens": "",
|
||||
"retriever": "brave_search",
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
@@ -21,7 +21,7 @@ class ClassicRAG(BaseRetriever):
|
||||
user_api_key=None,
|
||||
):
|
||||
self.question = question
|
||||
self.vectorstore = self._get_vectorstore(source=source)
|
||||
self.vectorstore = source['active_docs'] if 'active_docs' in source else None
|
||||
self.chat_history = chat_history
|
||||
self.prompt = prompt
|
||||
self.chunks = chunks
|
||||
@@ -38,14 +38,6 @@ class ClassicRAG(BaseRetriever):
|
||||
)
|
||||
self.user_api_key = user_api_key
|
||||
|
||||
def _get_vectorstore(self, source):
|
||||
if "active_docs" in source:
|
||||
vectorstore = "indexes/"+source["active_docs"]
|
||||
else:
|
||||
vectorstore = ""
|
||||
vectorstore = os.path.join("application", vectorstore)
|
||||
return vectorstore
|
||||
|
||||
def _get_data(self):
|
||||
if self.chunks == 0:
|
||||
docs = []
|
||||
|
||||
@@ -210,4 +210,3 @@ class ElasticsearchStore(BaseVectorStore):
|
||||
def delete_index(self):
    """Remove every document belonging to this store from Elasticsearch.

    Matches on the ``metadata.store.keyword`` field so only documents
    ingested under ``self.path`` are deleted from ``self.index_name``.
    """
    store_query = {"match": {"metadata.store.keyword": self.path}}
    self._es_connection.delete_by_query(index=self.index_name, query=store_query)
|
||||
|
||||
|
||||
@@ -1,12 +1,22 @@
|
||||
from langchain_community.vectorstores import FAISS
|
||||
from application.vectorstore.base import BaseVectorStore
|
||||
from application.core.settings import settings
|
||||
import os
|
||||
|
||||
def get_vectorstore(path):
    """Map a store id/path to its directory under ``application/``.

    A truthy *path* resolves to ``application/indexes/<path>``; a falsy
    one resolves to the plain ``application`` directory.
    """
    if not path:
        return os.path.join("application")
    return os.path.join("application", "indexes/" + path)
|
||||
|
||||
class FaissStore(BaseVectorStore):
|
||||
|
||||
def __init__(self, path, embeddings_key, docs_init=None):
|
||||
super().__init__()
|
||||
self.path = path
|
||||
self.path = get_vectorstore(path)
|
||||
embeddings = self._get_embeddings(settings.EMBEDDINGS_NAME, embeddings_key)
|
||||
if docs_init:
|
||||
self.docsearch = FAISS.from_documents(
|
||||
|
||||
@@ -6,6 +6,7 @@ import tiktoken
|
||||
from urllib.parse import urljoin
|
||||
|
||||
import requests
|
||||
from bson.objectid import ObjectId
|
||||
|
||||
from application.core.settings import settings
|
||||
from application.parser.file.bulk import SimpleDirectoryReader
|
||||
@@ -57,7 +58,7 @@ def extract_zip_recursive(zip_path, extract_to, current_depth=0, max_depth=5):
|
||||
|
||||
|
||||
# Define the main function for ingesting and processing documents.
|
||||
def ingest_worker(self, directory, formats, name_job, filename, user):
|
||||
def ingest_worker(self, directory, formats, name_job, filename, user, retriever="classic"):
|
||||
"""
|
||||
Ingest and process documents.
|
||||
|
||||
@@ -68,6 +69,7 @@ def ingest_worker(self, directory, formats, name_job, filename, user):
|
||||
name_job (str): Name of the job for this ingestion task.
|
||||
filename (str): Name of the file to be ingested.
|
||||
user (str): Identifier for the user initiating the ingestion.
|
||||
retriever (str): Type of retriever to use for processing the documents.
|
||||
|
||||
Returns:
|
||||
dict: Information about the completed ingestion task, including input parameters and a "limited" flag.
|
||||
@@ -136,7 +138,8 @@ def ingest_worker(self, directory, formats, name_job, filename, user):
|
||||
|
||||
# get files from outputs/inputs/index.faiss and outputs/inputs/index.pkl
|
||||
# and send them to the server (provide user and name in form)
|
||||
file_data = {"name": name_job, "user": user, "tokens": tokens}
|
||||
id = ObjectId()
|
||||
file_data = {"name": name_job, "user": user, "tokens": tokens, "retriever": retriever, "id": str(id), 'type': 'local'}
|
||||
if settings.VECTOR_STORE == "faiss":
|
||||
files = {
|
||||
"file_faiss": open(full_path + "/index.faiss", "rb"),
|
||||
@@ -160,7 +163,7 @@ def ingest_worker(self, directory, formats, name_job, filename, user):
|
||||
}
|
||||
|
||||
|
||||
def remote_worker(self, source_data, name_job, user, loader, directory="temp"):
|
||||
def remote_worker(self, source_data, name_job, user, loader, directory="temp", retriever="classic"):
|
||||
token_check = True
|
||||
min_tokens = 150
|
||||
max_tokens = 1250
|
||||
@@ -180,12 +183,14 @@ def remote_worker(self, source_data, name_job, user, loader, directory="temp"):
|
||||
token_check=token_check,
|
||||
)
|
||||
# docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs]
|
||||
call_openai_api(docs, full_path, self)
|
||||
tokens = count_tokens_docs(docs)
|
||||
call_openai_api(docs, full_path, self)
|
||||
self.update_state(state="PROGRESS", meta={"current": 100})
|
||||
|
||||
# Proceed with uploading and cleaning as in the original function
|
||||
file_data = {"name": name_job, "user": user, "tokens": tokens}
|
||||
id = ObjectId()
|
||||
file_data = {"name": name_job, "user": user, "tokens": tokens, "retriever": retriever,
|
||||
"id": str(id), 'type': loader, 'remote_data': source_data}
|
||||
if settings.VECTOR_STORE == "faiss":
|
||||
files = {
|
||||
"file_faiss": open(full_path + "/index.faiss", "rb"),
|
||||
|
||||
@@ -124,10 +124,8 @@ export default function Navigation({ navOpen, setNavOpen }: NavigationProps) {
|
||||
};
|
||||
|
||||
const handleDeleteClick = (doc: Doc) => {
|
||||
const docPath = `indexes/local/${doc.name}`;
|
||||
|
||||
userService
|
||||
.deletePath(docPath)
|
||||
.deletePath(doc.id ?? '')
|
||||
.then(() => {
|
||||
return getDocs();
|
||||
})
|
||||
|
||||
@@ -63,9 +63,6 @@ function SourceDropdown({
|
||||
<p className="max-w-3/4 truncate whitespace-nowrap">
|
||||
{selectedDocs?.name || 'None'}
|
||||
</p>
|
||||
<p className="flex flex-col items-center justify-center">
|
||||
{selectedDocs?.version}
|
||||
</p>
|
||||
</div>
|
||||
</span>
|
||||
<img
|
||||
|
||||
@@ -41,7 +41,7 @@ export function handleFetchAnswer(
|
||||
};
|
||||
if (selectedDocs && 'id' in selectedDocs)
|
||||
payload.active_docs = selectedDocs.id as string;
|
||||
else payload.retriever = selectedDocs?.docLink as string;
|
||||
payload.retriever = selectedDocs?.retriever as string;
|
||||
return conversationService
|
||||
.answer(payload, signal)
|
||||
.then((response) => {
|
||||
@@ -87,7 +87,7 @@ export function handleFetchAnswerSteaming(
|
||||
};
|
||||
if (selectedDocs && 'id' in selectedDocs)
|
||||
payload.active_docs = selectedDocs.id as string;
|
||||
else payload.retriever = selectedDocs?.docLink as string;
|
||||
payload.retriever = selectedDocs?.retriever as string;
|
||||
|
||||
return new Promise<Answer>((resolve, reject) => {
|
||||
conversationService
|
||||
@@ -160,7 +160,7 @@ export function handleSearch(
|
||||
};
|
||||
if (selectedDocs && 'id' in selectedDocs)
|
||||
payload.active_docs = selectedDocs.id as string;
|
||||
else payload.retriever = selectedDocs?.docLink as string;
|
||||
payload.retriever = selectedDocs?.retriever as string;
|
||||
return conversationService
|
||||
.search(payload)
|
||||
.then((response) => response.json())
|
||||
|
||||
@@ -46,27 +46,9 @@ export const ShareConversationModal = ({
|
||||
? docs
|
||||
.filter((doc) => doc.model === embeddingsName)
|
||||
.map((doc: Doc) => {
|
||||
let namePath = doc.name;
|
||||
if (doc.language === namePath) {
|
||||
namePath = '.project';
|
||||
}
|
||||
let docPath = 'default';
|
||||
if (doc.location === 'local') {
|
||||
docPath = 'local' + '/' + doc.name + '/';
|
||||
} else if (doc.location === 'remote') {
|
||||
docPath =
|
||||
doc.language +
|
||||
'/' +
|
||||
namePath +
|
||||
'/' +
|
||||
doc.version +
|
||||
'/' +
|
||||
doc.model +
|
||||
'/';
|
||||
}
|
||||
return {
|
||||
label: doc.name,
|
||||
value: docPath,
|
||||
value: doc.id ?? 'default',
|
||||
};
|
||||
})
|
||||
: [];
|
||||
|
||||
@@ -3,15 +3,12 @@ import userService from '../api/services/userService';
|
||||
|
||||
// not all properties in Doc are going to be present. Make some optional
|
||||
export type Doc = {
|
||||
location: string;
|
||||
id: string | null;
|
||||
name: string;
|
||||
language: string;
|
||||
version: string;
|
||||
description: string;
|
||||
fullName: string;
|
||||
type: string;
|
||||
date: string;
|
||||
docLink: string;
|
||||
model: string;
|
||||
retriever: string;
|
||||
};
|
||||
|
||||
//Fetches all JSON objects from the source. We only use the objects with the "model" property in SelectDocsModal.tsx. Hopefully can clean up the source file later.
|
||||
@@ -78,17 +75,10 @@ export function setLocalPrompt(prompt: string): void {
|
||||
|
||||
export function setLocalRecentDocs(doc: Doc): void {
|
||||
localStorage.setItem('DocsGPTRecentDocs', JSON.stringify(doc));
|
||||
let namePath = doc.name;
|
||||
if (doc.language === namePath) {
|
||||
namePath = '.project';
|
||||
}
|
||||
|
||||
let docPath = 'default';
|
||||
if (doc.location === 'local') {
|
||||
if (doc.type === 'local') {
|
||||
docPath = 'local' + '/' + doc.name + '/';
|
||||
} else if (doc.location === 'remote') {
|
||||
docPath =
|
||||
doc.language + '/' + namePath + '/' + doc.version + '/' + doc.model + '/';
|
||||
}
|
||||
userService
|
||||
.checkDocs({
|
||||
|
||||
@@ -25,15 +25,13 @@ const initialState: Preference = {
|
||||
chunks: '2',
|
||||
token_limit: 2000,
|
||||
selectedDocs: {
|
||||
id: 'default',
|
||||
name: 'default',
|
||||
language: 'default',
|
||||
location: 'default',
|
||||
version: 'default',
|
||||
description: 'default',
|
||||
fullName: 'default',
|
||||
type: 'remote',
|
||||
date: 'default',
|
||||
docLink: 'default',
|
||||
model: 'openai_text-embedding-ada-002',
|
||||
retriever: 'classic',
|
||||
} as Doc,
|
||||
sourceDocs: null,
|
||||
conversations: null,
|
||||
|
||||
@@ -35,9 +35,8 @@ export default function Settings() {
|
||||
};
|
||||
|
||||
const handleDeleteClick = (index: number, doc: Doc) => {
|
||||
const docPath = 'indexes/' + 'local' + '/' + doc.name;
|
||||
userService
|
||||
.deletePath(docPath)
|
||||
.deletePath(doc.id ?? '')
|
||||
.then((response) => {
|
||||
if (response.ok && documents) {
|
||||
const updatedDocuments = [
|
||||
|
||||
35
scripts/migrate_to_v1_vectorstore.py
Normal file
35
scripts/migrate_to_v1_vectorstore.py
Normal file
@@ -0,0 +1,35 @@
|
||||
import pymongo
|
||||
import os
|
||||
|
||||
def migrate_to_v1_vectorstore_mongo():
    """Upgrade every document in the ``vectors`` collection to the v1 schema.

    v1 drops the ``location`` field, guarantees a ``retriever`` field
    (defaulting to ``"classic"``), and sets ``remote_data`` to ``None``
    for all existing (locally ingested) sources.
    """
    client = pymongo.MongoClient("mongodb://localhost:27017/")
    try:
        vectors_collection = client["docsgpt"]["vectors"]

        for vector in vectors_collection.find():
            # Build explicit update operators. The previous approach —
            # deleting "location" from the local dict and $set-ing the whole
            # document — never removed the field from MongoDB, because $set
            # cannot unset keys; $unset is required for that. It also re-$set
            # the immutable "_id" field, which the server may reject.
            update_ops = {
                "$set": {
                    "retriever": vector.get("retriever", "classic"),
                    "remote_data": None,
                }
            }
            if "location" in vector:
                update_ops["$unset"] = {"location": ""}
            vectors_collection.update_one({"_id": vector["_id"]}, update_ops)
    finally:
        # Close the connection even if the migration loop raises.
        client.close()
|
||||
|
||||
def migrate_faiss_to_v1_vectorstore():
    """Relocate FAISS index directories to the v1 layout.

    Moves each store from the legacy ``application/indexes/<user>/<name>``
    path to ``application/indexes/<_id>`` based on the ``vectors``
    collection. Failures to move a directory are reported and skipped.
    """
    client = pymongo.MongoClient("mongodb://localhost:27017/")
    db = client["docsgpt"]
    vectors_collection = db["vectors"]

    for vector in vectors_collection.find():
        src = f"./application/indexes/{vector['user']}/{vector['name']}"
        dst = f"./application/indexes/{vector['_id']}"
        try:
            os.rename(src, dst)
        except OSError as err:
            # Best-effort migration: report the failed move and continue.
            print(f"Error moving {src} to {dst}: {err}")

    client.close()
|
||||
|
||||
migrate_faiss_to_v1_vectorstore()
|
||||
migrate_to_v1_vectorstore_mongo()
|
||||
Reference in New Issue
Block a user