mirror of
https://github.com/arc53/DocsGPT.git
synced 2025-11-30 09:03:15 +00:00
Merge branch 'main' of https://github.com/arc53/DocsGPT
This commit is contained in:
@@ -23,8 +23,6 @@ Say goodbye to time-consuming manual searches, and let <strong><a href="https://
|
||||
|
||||
</div>
|
||||
|
||||
### 🎃 [Hacktoberfest Prizes, Rules & Q&A](https://github.com/arc53/DocsGPT/blob/main/HACKTOBERFEST.md) 🎃
|
||||
|
||||
### Production Support / Help for Companies:
|
||||
|
||||
We're eager to provide personalized assistance when deploying your DocsGPT to a live environment.
|
||||
|
||||
@@ -11,8 +11,8 @@ from bson.objectid import ObjectId
|
||||
from flask import Blueprint, current_app, make_response, request, Response
|
||||
from flask_restx import fields, Namespace, Resource
|
||||
|
||||
from pymongo import MongoClient
|
||||
|
||||
from application.core.mongo_db import MongoDB
|
||||
from application.core.settings import settings
|
||||
from application.error import bad_request
|
||||
from application.extensions import api
|
||||
@@ -22,7 +22,7 @@ from application.utils import check_required_fields
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
mongo = MongoClient(settings.MONGO_URI)
|
||||
mongo = MongoDB.get_client()
|
||||
db = mongo["docsgpt"]
|
||||
conversations_collection = db["conversations"]
|
||||
sources_collection = db["sources"]
|
||||
|
||||
@@ -1,13 +1,13 @@
|
||||
import os
|
||||
import datetime
|
||||
from flask import Blueprint, request, send_from_directory
|
||||
from pymongo import MongoClient
|
||||
from werkzeug.utils import secure_filename
|
||||
from bson.objectid import ObjectId
|
||||
|
||||
from application.core.mongo_db import MongoDB
|
||||
from application.core.settings import settings
|
||||
|
||||
mongo = MongoClient(settings.MONGO_URI)
|
||||
mongo = MongoDB.get_client()
|
||||
db = mongo["docsgpt"]
|
||||
conversations_collection = db["conversations"]
|
||||
sources_collection = db["sources"]
|
||||
|
||||
@@ -8,18 +8,18 @@ from bson.dbref import DBRef
|
||||
from bson.objectid import ObjectId
|
||||
from flask import Blueprint, jsonify, make_response, request
|
||||
from flask_restx import inputs, fields, Namespace, Resource
|
||||
from pymongo import MongoClient
|
||||
from werkzeug.utils import secure_filename
|
||||
|
||||
from application.api.user.tasks import ingest, ingest_remote
|
||||
|
||||
from application.core.mongo_db import MongoDB
|
||||
from application.core.settings import settings
|
||||
from application.extensions import api
|
||||
from application.utils import check_required_fields
|
||||
from application.vectorstore.vector_creator import VectorCreator
|
||||
from application.tts.google_tts import GoogleTTS
|
||||
|
||||
mongo = MongoClient(settings.MONGO_URI)
|
||||
mongo = MongoDB.get_client()
|
||||
db = mongo["docsgpt"]
|
||||
conversations_collection = db["conversations"]
|
||||
sources_collection = db["sources"]
|
||||
@@ -343,6 +343,7 @@ class UploadFile(Resource):
|
||||
".mdx",
|
||||
".json",
|
||||
".xlsx",
|
||||
".pptx",
|
||||
],
|
||||
job_name,
|
||||
final_filename,
|
||||
|
||||
24
application/core/mongo_db.py
Normal file
24
application/core/mongo_db.py
Normal file
@@ -0,0 +1,24 @@
|
||||
from application.core.settings import settings
|
||||
from pymongo import MongoClient
|
||||
|
||||
|
||||
class MongoDB:
    """Process-wide holder for a single, lazily created ``MongoClient``.

    The client is stored on the class itself, so every caller of
    ``get_client`` receives the same instance until ``close_client``
    discards it.
    """

    # Cached client; ``None`` means no client is currently held.
    _client = None

    @classmethod
    def get_client(cls):
        """Return the cached client, building it from ``settings.MONGO_URI`` on first use."""
        if cls._client is not None:
            return cls._client

        cls._client = MongoClient(settings.MONGO_URI)
        return cls._client

    @classmethod
    def close_client(cls):
        """Close the cached client, if any, and forget it.

        Does nothing when no client is held. A later ``get_client`` call
        creates a fresh client.
        """
        active = cls._client
        if active is None:
            return

        active.close()
        cls._client = None
|
||||
@@ -12,6 +12,7 @@ from application.parser.file.markdown_parser import MarkdownParser
|
||||
from application.parser.file.rst_parser import RstParser
|
||||
from application.parser.file.tabular_parser import PandasCSVParser,ExcelParser
|
||||
from application.parser.file.json_parser import JSONParser
|
||||
from application.parser.file.pptx_parser import PPTXParser
|
||||
from application.parser.schema.base import Document
|
||||
|
||||
DEFAULT_FILE_EXTRACTOR: Dict[str, BaseParser] = {
|
||||
@@ -25,6 +26,7 @@ DEFAULT_FILE_EXTRACTOR: Dict[str, BaseParser] = {
|
||||
".html": HTMLParser(),
|
||||
".mdx": MarkdownParser(),
|
||||
".json":JSONParser(),
|
||||
".pptx":PPTXParser(),
|
||||
}
|
||||
|
||||
|
||||
|
||||
75
application/parser/file/pptx_parser.py
Normal file
75
application/parser/file/pptx_parser.py
Normal file
@@ -0,0 +1,75 @@
|
||||
"""PPT parser.
|
||||
Contains parsers for presentation (.pptx) files to extract slide text.
|
||||
"""
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Union
|
||||
|
||||
from application.parser.file.base_parser import BaseParser
|
||||
|
||||
class PPTXParser(BaseParser):
    r"""PPTX (.pptx) parser for extracting text from PowerPoint slides.

    Args:
        concat_slides (bool): Specifies whether to concatenate all slide text
            into one document.
            - If True, slide texts will be joined together as a single string.
            - If False, each slide's text will be stored as a separate entry
              in a list.
            Set to True by default.
        slide_separator (str): Separator used to join slides' text content.
            Only used when `concat_slides=True`. Default is "\n".

    Refer to https://python-pptx.readthedocs.io/en/latest/ for more information.
    """

    # Placed between the texts of the shapes of one slide so that, e.g., a
    # title and the body below it are not fused into a single word.
    _SHAPE_SEPARATOR = "\n"

    def __init__(
        self,
        *args: Any,
        concat_slides: bool = True,
        slide_separator: str = "\n",
        **kwargs: Any
    ) -> None:
        """Store the slide-joining options and forward the rest to the base parser."""
        super().__init__(*args, **kwargs)
        self._concat_slides = concat_slides
        self._slide_separator = slide_separator

    def _init_parser(self) -> Dict:
        """Return the parser config; this parser needs none, so it is empty."""
        return {}

    def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]]:
        r"""Parse a .pptx file and extract the text of each slide.

        Args:
            file (Path): Path to the .pptx file.
            errors (str): Accepted for signature compatibility with the other
                parsers; it is not used by this implementation.

        Returns:
            Union[str, List[str]]: The slide texts joined with the configured
            slide separator if ``concat_slides`` is True, otherwise a list
            with one (stripped) string per slide.

        Raises:
            ImportError: If the ``pptx`` module cannot be imported.
        """
        # Imported lazily so the dependency is only needed when a .pptx file
        # is actually parsed.
        try:
            from pptx import Presentation
        except ImportError as err:
            raise ImportError("pptx module is required to read .PPTX files.") from err

        presentation = Presentation(file)

        slide_texts = []
        for slide in presentation.slides:
            # Only shapes exposing a ``text`` attribute contribute; others
            # are skipped.
            shape_texts = [
                shape.text for shape in slide.shapes if hasattr(shape, "text")
            ]
            # Drop empty texts so they do not produce blank separator lines.
            slide_text = self._SHAPE_SEPARATOR.join(
                text for text in shape_texts if text
            )
            slide_texts.append(slide_text.strip())

        if self._concat_slides:
            return self._slide_separator.join(slide_texts)
        return slide_texts
|
||||
@@ -14,6 +14,7 @@ esutils==1.0.1
|
||||
Flask==3.0.3
|
||||
faiss-cpu==1.8.0.post1
|
||||
flask-restx==1.3.0
|
||||
gTTS==2.3.2
|
||||
gunicorn==23.0.0
|
||||
html2text==2024.2.26
|
||||
javalang==0.13.0
|
||||
@@ -65,6 +66,7 @@ pymongo==4.8.0
|
||||
pypdf2==3.0.1
|
||||
python-dateutil==2.9.0.post0
|
||||
python-dotenv==1.0.1
|
||||
python-pptx==1.0.2
|
||||
qdrant-client==1.11.0
|
||||
redis==5.0.1
|
||||
referencing==0.30.2
|
||||
@@ -84,5 +86,4 @@ urllib3==2.2.3
|
||||
vine==5.1.0
|
||||
wcwidth==0.2.13
|
||||
werkzeug==3.0.4
|
||||
yarl==1.11.1
|
||||
gTTS==2.3.2
|
||||
yarl==1.11.1
|
||||
@@ -75,7 +75,6 @@ class BraveRetSearch(BaseRetriever):
|
||||
if len(self.chat_history) > 1:
|
||||
tokens_current_history = 0
|
||||
# count tokens in history
|
||||
self.chat_history.reverse()
|
||||
for i in self.chat_history:
|
||||
if "prompt" in i and "response" in i:
|
||||
tokens_batch = num_tokens_from_string(i["prompt"]) + num_tokens_from_string(
|
||||
|
||||
@@ -78,7 +78,6 @@ class ClassicRAG(BaseRetriever):
|
||||
if len(self.chat_history) > 1:
|
||||
tokens_current_history = 0
|
||||
# count tokens in history
|
||||
self.chat_history.reverse()
|
||||
for i in self.chat_history:
|
||||
if "prompt" in i and "response" in i:
|
||||
tokens_batch = num_tokens_from_string(i["prompt"]) + num_tokens_from_string(
|
||||
@@ -97,7 +96,6 @@ class ClassicRAG(BaseRetriever):
|
||||
llm = LLMCreator.create_llm(
|
||||
settings.LLM_NAME, api_key=settings.API_KEY, user_api_key=self.user_api_key
|
||||
)
|
||||
|
||||
completion = llm.gen_stream(model=self.gpt_model, messages=messages_combine)
|
||||
for line in completion:
|
||||
yield {"answer": str(line)}
|
||||
|
||||
@@ -92,7 +92,6 @@ class DuckDuckSearch(BaseRetriever):
|
||||
if len(self.chat_history) > 1:
|
||||
tokens_current_history = 0
|
||||
# count tokens in history
|
||||
self.chat_history.reverse()
|
||||
for i in self.chat_history:
|
||||
if "prompt" in i and "response" in i:
|
||||
tokens_batch = num_tokens_from_string(i["prompt"]) + num_tokens_from_string(
|
||||
|
||||
@@ -1,10 +1,9 @@
|
||||
import sys
|
||||
from pymongo import MongoClient
|
||||
from datetime import datetime
|
||||
from application.core.settings import settings
|
||||
from application.core.mongo_db import MongoDB
|
||||
from application.utils import num_tokens_from_string
|
||||
|
||||
mongo = MongoClient(settings.MONGO_URI)
|
||||
mongo = MongoDB.get_client()
|
||||
db = mongo["docsgpt"]
|
||||
usage_collection = db["token_usage"]
|
||||
|
||||
|
||||
@@ -8,8 +8,8 @@ from urllib.parse import urljoin
|
||||
|
||||
import requests
|
||||
from bson.objectid import ObjectId
|
||||
from pymongo import MongoClient
|
||||
|
||||
from application.core.mongo_db import MongoDB
|
||||
from application.core.settings import settings
|
||||
from application.parser.file.bulk import SimpleDirectoryReader
|
||||
from application.parser.open_ai_func import call_openai_api
|
||||
@@ -18,7 +18,7 @@ from application.parser.schema.base import Document
|
||||
from application.parser.token_func import group_split
|
||||
from application.utils import count_tokens_docs
|
||||
|
||||
mongo = MongoClient(settings.MONGO_URI)
|
||||
mongo = MongoDB.get_client()
|
||||
db = mongo["docsgpt"]
|
||||
sources_collection = db["sources"]
|
||||
|
||||
|
||||
@@ -51,6 +51,9 @@ const config = {
|
||||
footer: {
|
||||
text: `MIT ${new Date().getFullYear()} © DocsGPT`,
|
||||
},
|
||||
editLink: {
|
||||
content: 'Edit this page on GitHub',
|
||||
},
|
||||
logo() {
|
||||
return (
|
||||
<div className="flex items-center gap-2">
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
|
||||
:root {
|
||||
--viewport-height: 100vh;
|
||||
font-synthesis: none !important;
|
||||
}
|
||||
|
||||
@supports (height: 100dvh) {
|
||||
|
||||
@@ -12,7 +12,7 @@
|
||||
"cancel": "Cancel",
|
||||
"help": "Help",
|
||||
"emailUs": "Email us",
|
||||
"documentation": "documentation",
|
||||
"documentation": "Documentation",
|
||||
"demo": [
|
||||
{
|
||||
"header": "Learn about DocsGPT",
|
||||
@@ -86,7 +86,7 @@
|
||||
"start": "Start Chatting",
|
||||
"name": "Name",
|
||||
"choose": "Choose Files",
|
||||
"info": "Please upload .pdf, .txt, .rst, .csv, .xlsx, .docx, .md, .html, .epub, .json, .zip limited to 25mb",
|
||||
"info": "Please upload .pdf, .txt, .rst, .csv, .xlsx, .docx, .md, .html, .epub, .json, .pptx, .zip limited to 25mb",
|
||||
"uploadedFiles": "Uploaded Files",
|
||||
"cancel": "Cancel",
|
||||
"train": "Train",
|
||||
|
||||
@@ -12,7 +12,7 @@
|
||||
"cancel": "Cancelar",
|
||||
"help": "Asistencia",
|
||||
"emailUs": "Envíanos un correo",
|
||||
"documentation": "documentación",
|
||||
"documentation": "Documentación",
|
||||
"demo": [
|
||||
{
|
||||
"header": "Aprende sobre DocsGPT",
|
||||
@@ -86,7 +86,7 @@
|
||||
"start": "Empezar a chatear",
|
||||
"name": "Nombre",
|
||||
"choose": "Seleccionar Archivos",
|
||||
"info": "Por favor, suba archivos .pdf, .txt, .rst, .csv, .xlsx, .docx, .md, .html, .epub, .json, .zip limitados a 25 MB",
|
||||
"info": "Por favor, suba archivos .pdf, .txt, .rst, .csv, .xlsx, .docx, .md, .html, .epub, .json, .pptx, .zip limitados a 25 MB",
|
||||
"uploadedFiles": "Archivos Subidos",
|
||||
"cancel": "Cancelar",
|
||||
"train": "Entrenar",
|
||||
|
||||
@@ -86,7 +86,7 @@
|
||||
"start": "チャットを開始する",
|
||||
"name": "名前",
|
||||
"choose": "ファイルを選択",
|
||||
"info": ".pdf, .txt, .rst, .docx, .md, .json, .zipファイルを25MBまでアップロードしてください",
|
||||
"info": ".pdf, .txt, .rst, .docx, .md, .json, .pptx, .zipファイルを25MBまでアップロードしてください",
|
||||
"uploadedFiles": "アップロードされたファイル",
|
||||
"cancel": "キャンセル",
|
||||
"train": "トレーニング",
|
||||
|
||||
@@ -80,7 +80,7 @@
|
||||
"remote": "遠端",
|
||||
"name": "名稱",
|
||||
"choose": "選擇檔案",
|
||||
"info": "請上傳 .pdf, .txt, .rst, .docx, .md, .json, .zip 檔案,大小限制為 25MB",
|
||||
"info": "請上傳 .pdf, .txt, .rst, .docx, .md, .json, .pptx, .zip 檔案,大小限制為 25MB",
|
||||
"uploadedFiles": "已上傳的檔案",
|
||||
"cancel": "取消",
|
||||
"train": "訓練",
|
||||
|
||||
@@ -86,7 +86,7 @@
|
||||
"start": "开始聊天",
|
||||
"name": "名称",
|
||||
"choose": "选择文件",
|
||||
"info": "请上传 .pdf, .txt, .rst, .csv, .xlsx, .docx, .md, .html, .epub, .json, .zip 文件,限 25MB",
|
||||
"info": "请上传 .pdf, .txt, .rst, .csv, .xlsx, .docx, .md, .html, .epub, .json, .pptx, .zip 文件,限 25MB",
|
||||
"uploadedFiles": "已上传文件",
|
||||
"cancel": "取消",
|
||||
"train": "训练",
|
||||
|
||||
@@ -321,6 +321,8 @@ function Upload({
|
||||
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': [
|
||||
'.xlsx',
|
||||
],
|
||||
'application/vnd.openxmlformats-officedocument.presentationml.presentation':
|
||||
['.pptx'],
|
||||
},
|
||||
});
|
||||
|
||||
|
||||
@@ -12,8 +12,8 @@ Welcome to the LLM Document Analysis by [LexEU](https://www.lexeu.ai/) competiti
|
||||
|
||||
### 📆 Timeline:
|
||||
- **Competition Announcement:** 1st October
|
||||
- **Deadline for Submissions:** 27th October
|
||||
- **Results Announcement:** Early November/ Late October
|
||||
- **Deadline for Submissions:** 8th November
|
||||
- **Results Announcement:** Early November
|
||||
|
||||
## 📜 How to Participate:
|
||||
|
||||
|
||||
@@ -4,8 +4,8 @@ source .env
|
||||
|
||||
if [[ -n "$OPENAI_API_BASE" ]] && [[ -n "$OPENAI_API_VERSION" ]] && [[ -n "$AZURE_DEPLOYMENT_NAME" ]] && [[ -n "$AZURE_EMBEDDINGS_DEPLOYMENT_NAME" ]]; then
|
||||
echo "Running Azure Configuration"
|
||||
docker compose -f docker-compose-azure.yaml build && docker compose -f docker-compose-azure.yaml up
|
||||
docker compose -f docker-compose-azure.yaml up --build
|
||||
else
|
||||
echo "Running Plain Configuration"
|
||||
docker compose build && docker compose up
|
||||
docker compose up --build
|
||||
fi
|
||||
|
||||
Reference in New Issue
Block a user