Merge remote-tracking branch 'origin/main' into issues/1226

This commit is contained in:
shatanikmahanty
2024-10-06 12:26:05 +05:30
18 changed files with 219 additions and 75 deletions

View File

@@ -7,7 +7,7 @@ All contributors with accepted PRs will receive a cool Holopin! 🤩 (Watch out
### 🏆 Top 50 contributors will recieve a special T-shirt
### 🏆 [LLM Document analysis by LexEU competition](https://github.com/arc53/DocsGPT/blob/main/lexeu-competition.md):
A separate competition is available for those who sumbit new retrieval / workflow method that will analyze a Document using EU laws.
A separate competition is available for those who submit new retrieval / workflow method that will analyze a Document using EU laws.
With 200$, 100$, 50$ prize for 1st, 2nd and 3rd place respectively.
You can find more information [here](https://github.com/arc53/DocsGPT/blob/main/lexeu-competition.md)

View File

@@ -363,6 +363,7 @@ class UploadRemote(Resource):
),
"name": fields.String(required=True, description="Job name"),
"data": fields.String(required=True, description="Data to process"),
"repo_url": fields.String(description="GitHub repository URL"),
},
)
)
@@ -377,11 +378,18 @@ class UploadRemote(Resource):
return missing_fields
try:
if "repo_url" in data:
source_data = data["repo_url"]
loader = "github"
else:
source_data = data["data"]
loader = data["source"]
task = ingest_remote.delay(
source_data=data["data"],
source_data=source_data,
job_name=data["name"],
user=data["user"],
loader=data["source"],
loader=loader,
)
except Exception as err:
return make_response(jsonify({"success": False, "error": str(err)}), 400)

View File

@@ -0,0 +1,53 @@
import base64
import requests
from typing import List
from application.parser.remote.base import BaseRemote
from langchain_core.documents import Document
class GitHubLoader(BaseRemote):
def __init__(self):
self.access_token = None
self.headers = {
"Authorization": f"token {self.access_token}"
} if self.access_token else {}
return
def fetch_file_content(self, repo_url: str, file_path: str) -> str:
url = f"https://api.github.com/repos/{repo_url}/contents/{file_path}"
response = requests.get(url, headers=self.headers)
if response.status_code == 200:
content = response.json()
if content.get("encoding") == "base64":
try:
decoded_content = base64.b64decode(content["content"]).decode("utf-8")
return f"Filename: {file_path}\n\n{decoded_content}"
except Exception as e:
print(f"Error decoding content for {file_path}: {e}")
raise
else:
return f"Filename: {file_path}\n\n{content['content']}"
else:
response.raise_for_status()
def fetch_repo_files(self, repo_url: str, path: str = "") -> List[str]:
url = f"https://api.github.com/repos/{repo_url}/contents/{path}"
response = requests.get(url, headers={**self.headers, "Accept": "application/vnd.github.v3.raw"})
contents = response.json()
files = []
for item in contents:
if item["type"] == "file":
files.append(item["path"])
elif item["type"] == "dir":
files.extend(self.fetch_repo_files(repo_url, item["path"]))
return files
def load_data(self, repo_url: str) -> List[Document]:
repo_name = repo_url.split("github.com/")[-1]
files = self.fetch_repo_files(repo_name)
documents = []
for file_path in files:
content = self.fetch_file_content(repo_name, file_path)
documents.append(Document(page_content=content, metadata={"title": file_path,
"source": f"https://github.com/{repo_name}/blob/main/{file_path}"}))
return documents

View File

@@ -2,6 +2,7 @@ from application.parser.remote.sitemap_loader import SitemapLoader
from application.parser.remote.crawler_loader import CrawlerLoader
from application.parser.remote.web_loader import WebLoader
from application.parser.remote.reddit_loader import RedditPostsLoaderRemote
from application.parser.remote.github_loader import GitHubLoader
class RemoteCreator:
@@ -10,6 +11,7 @@ class RemoteCreator:
"sitemap": SitemapLoader,
"crawler": CrawlerLoader,
"reddit": RedditPostsLoaderRemote,
"github": GitHubLoader,
}
@classmethod

View File

@@ -37,7 +37,7 @@ export default function Hero({
<Fragment key={key}>
<button
onClick={() => handleQuestion({ question: demo.query })}
className="w-full rounded-full border-2 border-silver px-6 py-4 text-left hover:border-gray-4000 dark:hover:border-gray-3000 xl:min-w-[24vw]"
className="w-full rounded-full border border-silver px-6 py-4 text-left hover:border-gray-4000 dark:hover:border-gray-3000 xl:min-w-[24vw]"
>
<p className="mb-1 font-semibold text-black dark:text-silver">
{demo.header}

View File

@@ -119,6 +119,7 @@ export default function Navigation({ navOpen, setNavOpen }: NavigationProps) {
.delete(id, {})
.then(() => {
fetchConversations();
resetConversation();
})
.catch((error) => console.error(error));
};
@@ -155,6 +156,15 @@ export default function Navigation({ navOpen, setNavOpen }: NavigationProps) {
});
};
const resetConversation = () => {
dispatch(setConversation([]));
dispatch(
updateConversationId({
query: { conversationId: null },
}),
);
};
async function updateConversationName(updatedConversation: {
name: string;
id: string;
@@ -240,12 +250,7 @@ export default function Navigation({ navOpen, setNavOpen }: NavigationProps) {
if (isMobile) {
setNavOpen(!navOpen);
}
dispatch(setConversation([]));
dispatch(
updateConversationId({
query: { conversationId: null },
}),
);
resetConversation();
}}
className={({ isActive }) =>
`${isActive ? 'bg-gray-3000 dark:bg-transparent' : ''
@@ -321,6 +326,7 @@ export default function Navigation({ navOpen, setNavOpen }: NavigationProps) {
if (isMobile) {
setNavOpen(!navOpen);
}
resetConversation();
}}
to="/settings"
className={({ isActive }) =>
@@ -344,6 +350,7 @@ export default function Navigation({ navOpen, setNavOpen }: NavigationProps) {
if (isMobile) {
setNavOpen(!navOpen);
}
resetConversation();
}}
to="/about"
className={({ isActive }) =>

View File

@@ -121,7 +121,9 @@ function SourceDropdown({
className="flex cursor-pointer items-center justify-between hover:bg-gray-100 dark:text-bright-gray dark:hover:bg-purple-taupe"
onClick={handleEmptyDocumentSelect}
>
<span className="ml-4 flex-1 overflow-hidden overflow-ellipsis whitespace-nowrap py-3">
<span className="ml-4 flex-1 overflow-hidden overflow-ellipsis whitespace-nowrap py-3" onClick = {() => {
handlePostDocumentSelect(null);
}}>
{t('none')}
</span>
</div>

View File

@@ -54,10 +54,6 @@ export default function Conversation() {
}
}, []);
useEffect(() => {
fetchStream.current && fetchStream.current.abort();
}, [conversationId]);
useEffect(() => {
if (queries.length) {
queries[queries.length - 1].error && setLastQueryReturnedErr(true);

View File

@@ -151,6 +151,7 @@ export const conversationSlice = createSlice({
state,
action: PayloadAction<{ index: number; query: Partial<Query> }>,
) {
if (state.status === 'idle') return;
const { index, query } = action.payload;
if (query.response != undefined) {
state.queries[index].response =
@@ -167,6 +168,7 @@ export const conversationSlice = createSlice({
action: PayloadAction<{ query: Partial<Query> }>,
) {
state.conversationId = action.payload.query.conversationId ?? null;
state.status = 'idle';
},
updateStreamingSource(
state,

View File

@@ -47,6 +47,28 @@ body.dark {
}
}
@layer components {
.table-default {
@apply block w-max table-auto content-center justify-center rounded-xl border border-silver dark:border-silver/40 text-center dark:text-bright-gray;
}
.table-default th {
@apply border-r border-silver dark:border-silver/40 p-4 w-[244px];
}
.table-default th:last-child {
@apply w-[auto] border-r-0;
}
.table-default td {
@apply border-r border-t border-silver dark:border-silver/40 px-4 py-2;
}
.table-default td:last-child {
@apply border-r-0;
}
}
/*! normalize.css v8.0.1 | MIT License | github.com/necolas/normalize.css */
/* Document

View File

@@ -85,6 +85,7 @@
"train": "Train",
"link": "Link",
"urlLink": "URL Link",
"repoUrl": "Repository URL",
"reddit": {
"id": "Client ID",
"secret": "Client Secret",

View File

@@ -85,6 +85,7 @@
"train": "Entrenar",
"link": "Enlace",
"urlLink": "Enlace URL",
"repoUrl": "URL del Repositorio",
"reddit": {
"id": "ID de Cliente",
"secret": "Secreto de Cliente",

View File

@@ -85,6 +85,7 @@
"train": "トレーニング",
"link": "リンク",
"urlLink": "URLリンク",
"repoUrl": "リポジトリURL",
"reddit": {
"id": "クライアントID",
"secret": "クライアントシークレット",

View File

@@ -85,6 +85,7 @@
"train": "训练",
"link": "链接",
"urlLink": "URL 链接",
"repoUrl": "存储库 URL",
"reddit": {
"id": "客户端 ID",
"secret": "客户端密钥",

View File

@@ -100,35 +100,29 @@ export default function APIKeys() {
)}
<div className="mt-[27px] w-full">
<div className="w-full overflow-x-auto">
<table className="block w-max table-auto content-center justify-center rounded-xl border text-center dark:border-chinese-silver dark:text-bright-gray">
<table className="table-default">
<thead>
<tr>
<th className="w-[244px] border-r p-4">
{t('settings.apiKeys.name')}
</th>
<th className="w-[244px] border-r px-4 py-2">
{t('settings.apiKeys.sourceDoc')}
</th>
<th className="w-[244px] border-r px-4 py-2">
{t('settings.apiKeys.key')}
</th>
<th className="px-4 py-2"></th>
<th>{t('settings.apiKeys.name')}</th>
<th>{t('settings.apiKeys.sourceDoc')}</th>
<th>{t('settings.apiKeys.key')}</th>
<th></th>
</tr>
</thead>
<tbody>
{!apiKeys?.length && (
<tr>
<td colSpan={4} className="border-t p-4">
<td colSpan={4} className="!p-4">
{t('settings.apiKeys.noData')}
</td>
</tr>
)}
{apiKeys?.map((element, index) => (
<tr key={index}>
<td className="border-r border-t p-4">{element.name}</td>
<td className="border-r border-t p-4">{element.source}</td>
<td className="border-r border-t p-4">{element.key}</td>
<td className="border-t p-4">
<td>{element.name}</td>
<td>{element.source}</td>
<td>{element.key}</td>
<td>
<img
src={Trash}
alt="Delete"

View File

@@ -55,28 +55,20 @@ const Documents: React.FC<DocumentsProps> = ({
<div className="mt-8">
<div className="flex flex-col relative">
<div className="z-10 w-full overflow-x-auto">
<table className="block w-max table-auto content-center justify-center rounded-xl border text-center dark:border-chinese-silver dark:text-bright-gray">
<table className="table-default">
<thead>
<tr>
<th className="border-r p-4 md:w-[244px]">
{t('settings.documents.name')}
</th>
<th className="w-[244px] border-r px-4 py-2">
{t('settings.documents.date')}
</th>
<th className="w-[244px] border-r px-4 py-2">
{t('settings.documents.tokenUsage')}
</th>
<th className="w-[244px] border-r px-4 py-2">
{t('settings.documents.type')}
</th>
<th className="px-4 py-2"></th>
<th>{t('settings.documents.name')}</th>
<th>{t('settings.documents.date')}</th>
<th>{t('settings.documents.tokenUsage')}</th>
<th>{t('settings.documents.type')}</th>
<th></th>
</tr>
</thead>
<tbody>
{!documents?.length && (
<tr>
<td colSpan={5} className="border-t p-4">
<td colSpan={5} className="!p-4">
{t('settings.documents.noData')}
</td>
</tr>
@@ -84,19 +76,15 @@ const Documents: React.FC<DocumentsProps> = ({
{documents &&
documents.map((document, index) => (
<tr key={index}>
<td className="border-r border-t px-4 py-2">
{document.name}
</td>
<td className="border-r border-t px-4 py-2">
{document.date}
</td>
<td className="border-r border-t px-4 py-2">
<td>{document.name}</td>
<td>{document.date}</td>
<td>
{document.tokens ? formatTokens(+document.tokens) : ''}
</td>
<td className="border-r border-t px-4 py-2">
<td>
{document.type === 'remote' ? 'Pre-loaded' : 'Private'}
</td>
<td className="border-t px-4 py-2">
<td>
<div className="flex flex-row items-center">
{document.type !== 'remote' && (
<img

View File

@@ -24,6 +24,7 @@ function Upload({
const [docName, setDocName] = useState('');
const [urlName, setUrlName] = useState('');
const [url, setUrl] = useState('');
const [repoUrl, setRepoUrl] = useState(''); // P3f93
const [redditData, setRedditData] = useState({
client_id: '',
client_secret: '',
@@ -48,6 +49,7 @@ function Upload({
// { label: 'Sitemap', value: 'sitemap' },
{ label: 'Link', value: 'url' },
{ label: 'Reddit', value: 'reddit' },
{ label: 'GitHub', value: 'github' }, // P3f93
];
const [urlType, setUrlType] = useState<{ label: string; value: string }>({
@@ -238,6 +240,9 @@ function Upload({
formData.set('name', 'other');
formData.set('data', JSON.stringify(redditData));
}
if (urlType.value === 'github') {
formData.append('repo_url', repoUrl); // Pdeac
}
const apiHost = import.meta.env.VITE_API_HOST;
const xhr = new XMLHttpRequest();
xhr.upload.addEventListener('progress', (event) => {
@@ -376,7 +381,7 @@ function Upload({
size="w-full"
rounded="3xl"
/>
{urlType.label !== 'Reddit' ? (
{urlType.label !== 'Reddit' && urlType.label !== 'GitHub' ? (
<>
<Input
placeholder={`Enter ${t('modals.uploadDoc.name')}`}
@@ -403,6 +408,33 @@ function Upload({
</span>
</div>
</>
) : urlType.label === 'GitHub' ? ( // P3f93
<>
<Input
placeholder={`Enter ${t('modals.uploadDoc.name')}`}
type="text"
value={urlName}
onChange={(e) => setUrlName(e.target.value)}
borderVariant="thin"
></Input>
<div className="relative bottom-12 left-2 mt-[-20px]">
<span className="bg-white px-2 text-xs text-gray-4000 dark:bg-outer-space dark:text-silver">
{t('modals.uploadDoc.name')}
</span>
</div>
<Input
placeholder={t('modals.uploadDoc.repoUrl')}
type="text"
value={repoUrl}
onChange={(e) => setRepoUrl(e.target.value)}
borderVariant="thin"
></Input>
<div className="relative bottom-12 left-2 mt-[-20px]">
<span className="bg-white px-2 text-xs text-gray-4000 dark:bg-outer-space dark:text-silver">
{t('modals.uploadDoc.repoUrl')}
</span>
</div>
</>
) : (
<div className="flex flex-col gap-1 mt-2">
<div>

View File

@@ -1,13 +1,35 @@
import pymongo
import os
import shutil
import logging
from tqdm import tqdm
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger()
# Configuration
MONGO_URI = "mongodb://localhost:27017/"
MONGO_ATLAS_URI = "mongodb+srv://<username>:<password>@<cluster>/<dbname>?retryWrites=true&w=majority"
DB_NAME = "docsgpt"
def backup_collection(collection, backup_collection_name):
logger.info(f"Backing up collection {collection.name} to {backup_collection_name}")
collection.aggregate([{"$out": backup_collection_name}])
logger.info("Backup completed")
def migrate_to_v1_vectorstore_mongo():
client = pymongo.MongoClient("mongodb://localhost:27017/")
db = client["docsgpt"]
client = pymongo.MongoClient(MONGO_URI)
db = client[DB_NAME]
vectors_collection = db["vectors"]
sources_collection = db["sources"]
for vector in vectors_collection.find():
# Backup collections before migration
backup_collection(vectors_collection, "vectors_backup")
backup_collection(sources_collection, "sources_backup")
vectors = list(vectors_collection.find())
for vector in tqdm(vectors, desc="Updating vectors"):
if "location" in vector:
del vector["location"]
if "retriever" not in vector:
@@ -15,41 +37,53 @@ def migrate_to_v1_vectorstore_mongo():
vector["remote_data"] = None
vectors_collection.update_one({"_id": vector["_id"]}, {"$set": vector})
# move data from vectors_collection to sources_collection
for vector in vectors_collection.find():
# Move data from vectors_collection to sources_collection
for vector in tqdm(vectors, desc="Moving to sources"):
sources_collection.insert_one(vector)
vectors_collection.drop()
client.close()
logger.info("Migration completed")
def migrate_faiss_to_v1_vectorstore():
client = pymongo.MongoClient("mongodb://localhost:27017/")
db = client["docsgpt"]
client = pymongo.MongoClient(MONGO_URI)
db = client[DB_NAME]
vectors_collection = db["vectors"]
for vector in vectors_collection.find():
vectors = list(vectors_collection.find())
for vector in tqdm(vectors, desc="Migrating FAISS vectors"):
old_path = f"./application/indexes/{vector['user']}/{vector['name']}"
new_path = f"./application/indexes/{vector['_id']}"
try:
os.rename(old_path, new_path)
os.makedirs(os.path.dirname(new_path), exist_ok=True)
shutil.move(old_path, new_path)
except OSError as e:
print(f"Error moving {old_path} to {new_path}: {e}")
logger.error(f"Error moving {old_path} to {new_path}: {e}")
client.close()
logger.info("FAISS migration completed")
def migrate_mongo_atlas_vector_to_v1_vectorstore():
client = pymongo.MongoClient("mongodb+srv://<username>:<password>@<cluster>/<dbname>?retryWrites=true&w=majority")
db = client["docsgpt"]
client = pymongo.MongoClient(MONGO_ATLAS_URI)
db = client[DB_NAME]
vectors_collection = db["vectors"]
# mongodb atlas collection
documents_collection = db["documents"]
for vector in vectors_collection.find():
documents_collection.update_many({"store": vector["user"] + "/" + vector["name"]}, {"$set": {"source_id": str(vector["_id"])}})
# Backup collections before migration
backup_collection(vectors_collection, "vectors_backup")
backup_collection(documents_collection, "documents_backup")
vectors = list(vectors_collection.find())
for vector in tqdm(vectors, desc="Updating Mongo Atlas vectors"):
documents_collection.update_many(
{"store": vector["user"] + "/" + vector["name"]},
{"$set": {"source_id": str(vector["_id"])}}
)
client.close()
logger.info("Mongo Atlas migration completed")
migrate_faiss_to_v1_vectorstore()
migrate_to_v1_vectorstore_mongo()
if __name__ == "__main__":
migrate_faiss_to_v1_vectorstore()
migrate_to_v1_vectorstore_mongo()
migrate_mongo_atlas_vector_to_v1_vectorstore()