diff --git a/README.md b/README.md index 8f5897fa..eeecb598 100644 --- a/README.md +++ b/README.md @@ -23,8 +23,6 @@ Say goodbye to time-consuming manual searches, and let None: + """Init params.""" + super().__init__(*args, **kwargs) + self._concat_slides = concat_slides + self._slide_separator = slide_separator + + def _init_parser(self) -> Dict: + """Init parser.""" + return {} + + def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]]: + r""" + Parse a .pptx file and extract text from each slide. + Args: + file (Path): Path to the .pptx file. + errors (str): Error handling policy ('ignore' by default). + Returns: + Union[str, List[str]]: Concatenated text if concat_slides is True, + otherwise a list of slide texts. + """ + + try: + from pptx import Presentation + except ImportError: + raise ImportError("pptx module is required to read .PPTX files.") + + try: + presentation = Presentation(file) + slide_texts=[] + + # Iterate over each slide in the presentation + for slide in presentation.slides: + slide_text="" + + # Iterate over each shape in the slide + for shape in slide.shapes: + # Check if the shape has a 'text' attribute and append that to the slide_text + if hasattr(shape,"text"): + slide_text+=shape.text + + slide_texts.append(slide_text.strip()) + + if self._concat_slides: + return self._slide_separator.join(slide_texts) + else: + return slide_texts + + except Exception as e: + raise e \ No newline at end of file diff --git a/application/requirements.txt b/application/requirements.txt index aad629f1..2f28c2ea 100644 --- a/application/requirements.txt +++ b/application/requirements.txt @@ -14,6 +14,7 @@ esutils==1.0.1 Flask==3.0.3 faiss-cpu==1.8.0.post1 flask-restx==1.3.0 +gTTS==2.3.2 gunicorn==23.0.0 html2text==2024.2.26 javalang==0.13.0 @@ -65,6 +66,7 @@ pymongo==4.8.0 pypdf2==3.0.1 python-dateutil==2.9.0.post0 python-dotenv==1.0.1 +python-pptx==1.0.2 qdrant-client==1.11.0 redis==5.0.1 referencing==0.30.2 @@ -84,5 +86,4 @@ urllib3==2.2.3 vine==5.1.0 wcwidth==0.2.13 werkzeug==3.0.4 -yarl==1.11.1 -gTTS==2.3.2 \ No newline at end of file +yarl==1.11.1 \ No newline at end of file diff --git a/application/retriever/brave_search.py b/application/retriever/brave_search.py index 29666a57..1fd844b2 100644 --- a/application/retriever/brave_search.py +++ b/application/retriever/brave_search.py @@ -75,7 +75,6 @@ class BraveRetSearch(BaseRetriever): if len(self.chat_history) > 1: tokens_current_history = 0 # count tokens in history - self.chat_history.reverse() for i in self.chat_history: if "prompt" in i and "response" in i: tokens_batch = num_tokens_from_string(i["prompt"]) + num_tokens_from_string( diff --git a/application/retriever/classic_rag.py b/application/retriever/classic_rag.py index b87b5852..6a67cb38 100644 --- a/application/retriever/classic_rag.py +++ b/application/retriever/classic_rag.py @@ -78,7 +78,6 @@ class ClassicRAG(BaseRetriever): if len(self.chat_history) > 1: tokens_current_history = 0 # count tokens in history - self.chat_history.reverse() for i in self.chat_history: if "prompt" in i and "response" in i: tokens_batch = num_tokens_from_string(i["prompt"]) + num_tokens_from_string( @@ -97,7 +96,6 @@ class ClassicRAG(BaseRetriever): llm = LLMCreator.create_llm( settings.LLM_NAME, api_key=settings.API_KEY, user_api_key=self.user_api_key ) - completion = llm.gen_stream(model=self.gpt_model, messages=messages_combine) for line in completion: yield {"answer": str(line)} diff --git a/application/retriever/duckduck_search.py b/application/retriever/duckduck_search.py index d746ecaa..6ae56226 100644 --- a/application/retriever/duckduck_search.py +++ b/application/retriever/duckduck_search.py @@ -92,7 +92,6 @@ class DuckDuckSearch(BaseRetriever): if len(self.chat_history) > 1: tokens_current_history = 0 # count tokens in history - self.chat_history.reverse() for i in self.chat_history: if "prompt" in i and "response" in i: tokens_batch = num_tokens_from_string(i["prompt"]) + num_tokens_from_string( diff --git a/application/usage.py b/application/usage.py index aba0ec77..e87ebe38 100644 --- a/application/usage.py +++ b/application/usage.py @@ -1,10 +1,9 @@ import sys -from pymongo import MongoClient from datetime import datetime -from application.core.settings import settings +from application.core.mongo_db import MongoDB from application.utils import num_tokens_from_string -mongo = MongoClient(settings.MONGO_URI) +mongo = MongoDB.get_client() db = mongo["docsgpt"] usage_collection = db["token_usage"] diff --git a/application/worker.py b/application/worker.py index f8f38afa..33cd90e5 100755 --- a/application/worker.py +++ b/application/worker.py @@ -8,8 +8,8 @@ from urllib.parse import urljoin import requests from bson.objectid import ObjectId -from pymongo import MongoClient +from application.core.mongo_db import MongoDB from application.core.settings import settings from application.parser.file.bulk import SimpleDirectoryReader from application.parser.open_ai_func import call_openai_api @@ -18,7 +18,7 @@ from application.parser.schema.base import Document from application.parser.token_func import group_split from application.utils import count_tokens_docs -mongo = MongoClient(settings.MONGO_URI) +mongo = MongoDB.get_client() db = mongo["docsgpt"] sources_collection = db["sources"] diff --git a/docs/theme.config.jsx b/docs/theme.config.jsx index 2b868db7..777a0ed5 100644 --- a/docs/theme.config.jsx +++ b/docs/theme.config.jsx @@ -51,6 +51,9 @@ const config = { footer: { text: `MIT ${new Date().getFullYear()} © DocsGPT`, }, + editLink: { + content: 'Edit this page on GitHub', + }, logo() { return (
diff --git a/frontend/src/index.css b/frontend/src/index.css index 9b87724a..4319403e 100644 --- a/frontend/src/index.css +++ b/frontend/src/index.css @@ -4,6 +4,7 @@ :root { --viewport-height: 100vh; + font-synthesis: none !important; } @supports (height: 100dvh) { diff --git a/frontend/src/locale/en.json b/frontend/src/locale/en.json index a1f254ac..b1268df7 100644 --- a/frontend/src/locale/en.json +++ b/frontend/src/locale/en.json @@ -12,7 +12,7 @@ "cancel": "Cancel", "help": "Help", "emailUs": "Email us", - "documentation": "documentation", + "documentation": "Documentation", "demo": [ { "header": "Learn about DocsGPT", @@ -86,7 +86,7 @@ "start": "Start Chatting", "name": "Name", "choose": "Choose Files", - "info": "Please upload .pdf, .txt, .rst, .csv, .xlsx, .docx, .md, .html, .epub, .json, .zip limited to 25mb", + "info": "Please upload .pdf, .txt, .rst, .csv, .xlsx, .docx, .md, .html, .epub, .json, .pptx, .zip limited to 25mb", "uploadedFiles": "Uploaded Files", "cancel": "Cancel", "train": "Train", diff --git a/frontend/src/locale/es.json b/frontend/src/locale/es.json index 6a096ffd..296dbb16 100644 --- a/frontend/src/locale/es.json +++ b/frontend/src/locale/es.json @@ -12,7 +12,7 @@ "cancel": "Cancelar", "help": "Asistencia", "emailUs": "Envíanos un correo", - "documentation": "documentación", + "documentation": "Documentación", "demo": [ { "header": "Aprende sobre DocsGPT", @@ -86,7 +86,7 @@ "start": "Empezar a chatear", "name": "Nombre", "choose": "Seleccionar Archivos", - "info": "Por favor, suba archivos .pdf, .txt, .rst, .csv, .xlsx, .docx, .md, .html, .epub, .json, .zip limitados a 25 MB", + "info": "Por favor, suba archivos .pdf, .txt, .rst, .csv, .xlsx, .docx, .md, .html, .epub, .json, .pptx, .zip limitados a 25 MB", "uploadedFiles": "Archivos Subidos", "cancel": "Cancelar", "train": "Entrenar", diff --git a/frontend/src/locale/jp.json b/frontend/src/locale/jp.json index 841a477b..a69ae31f 100644 --- a/frontend/src/locale/jp.json +++ b/frontend/src/locale/jp.json @@ -86,7 +86,7 @@ "start": "チャットを開始する", "name": "名前", "choose": "ファイルを選択", - "info": ".pdf, .txt, .rst, .docx, .md, .json, .zipファイルを25MBまでアップロードしてください", + "info": ".pdf, .txt, .rst, .docx, .md, .json, .pptx, .zipファイルを25MBまでアップロードしてください", "uploadedFiles": "アップロードされたファイル", "cancel": "キャンセル", "train": "トレーニング", diff --git a/frontend/src/locale/zh-TW.json b/frontend/src/locale/zh-TW.json index 35df818b..fa0638f4 100644 --- a/frontend/src/locale/zh-TW.json +++ b/frontend/src/locale/zh-TW.json @@ -80,7 +80,7 @@ "remote": "遠端", "name": "名稱", "choose": "選擇檔案", - "info": "請上傳 .pdf, .txt, .rst, .docx, .md, .json, .zip 檔案,大小限制為 25MB", + "info": "請上傳 .pdf, .txt, .rst, .docx, .md, .json, .pptx, .zip 檔案,大小限制為 25MB", "uploadedFiles": "已上傳的檔案", "cancel": "取消", "train": "訓練", diff --git a/frontend/src/locale/zh.json b/frontend/src/locale/zh.json index 710c5e3e..51f8bfe9 100644 --- a/frontend/src/locale/zh.json +++ b/frontend/src/locale/zh.json @@ -86,7 +86,7 @@ "start": "开始聊天", "name": "名称", "choose": "选择文件", - "info": "请上传 .pdf, .txt, .rst, .csv, .xlsx, .docx, .md, .html, .epub, .json, .zip 文件,限 25MB", + "info": "请上传 .pdf, .txt, .rst, .csv, .xlsx, .docx, .md, .html, .epub, .json, .pptx, .zip 文件,限 25MB", "uploadedFiles": "已上传文件", "cancel": "取消", "train": "训练", diff --git a/frontend/src/upload/Upload.tsx b/frontend/src/upload/Upload.tsx index 2da284c3..81ce9f2b 100644 --- a/frontend/src/upload/Upload.tsx +++ b/frontend/src/upload/Upload.tsx @@ -321,6 +321,8 @@ function Upload({ 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': [ '.xlsx', ], + 'application/vnd.openxmlformats-officedocument.presentationml.presentation': + ['.pptx'], }, }); diff --git a/lexeu-competition.md b/lexeu-competition.md index 1077de29..e8824438 100644 --- a/lexeu-competition.md +++ b/lexeu-competition.md @@ -12,8 +12,8 @@ Welcome to the LLM Document Analysis by [LexEU](https://www.lexeu.ai/) competiti ### 📆 Timeline: - **Competition Announcement:** 1st October -- **Deadline for Submissions:** 27th October -- **Results Announcement:** Early November/ Late October +- **Deadline for Submissions:** 8th November +- **Results Announcement:** Early November ## 📜 How to Participate: diff --git a/run-with-docker-compose.sh b/run-with-docker-compose.sh index 61aab467..145b1e23 100755 --- a/run-with-docker-compose.sh +++ b/run-with-docker-compose.sh @@ -4,8 +4,8 @@ source .env if [[ -n "$OPENAI_API_BASE" ]] && [[ -n "$OPENAI_API_VERSION" ]] && [[ -n "$AZURE_DEPLOYMENT_NAME" ]] && [[ -n "$AZURE_EMBEDDINGS_DEPLOYMENT_NAME" ]]; then echo "Running Azure Configuration" - docker compose -f docker-compose-azure.yaml build && docker compose -f docker-compose-azure.yaml up + docker compose -f docker-compose-azure.yaml up --build else echo "Running Plain Configuration" - docker compose build && docker compose up + docker compose up --build fi