From cce60ce10143d0e7c7d9ce7724a98406fe384b17 Mon Sep 17 00:00:00 2001
From: Alex
Date: Tue, 19 Nov 2024 16:22:58 +0000
Subject: [PATCH 1/3] fix: save convo messages, docsgpt provider format
---
application/api/answer/routes.py | 12 ++++++------
application/llm/docsgpt_provider.py | 16 +++-------------
2 files changed, 9 insertions(+), 19 deletions(-)
diff --git a/application/api/answer/routes.py b/application/api/answer/routes.py
index f109db26..f9ab19be 100644
--- a/application/api/answer/routes.py
+++ b/application/api/answer/routes.py
@@ -141,17 +141,17 @@ def save_conversation(conversation_id, question, response, source_log_docs, llm)
"role": "assistant",
"content": "Summarise following conversation in no more than 3 "
"words, respond ONLY with the summary, use the same "
- "language as the system \n\nUser: "
- + question
- + "\n\n"
- + "AI: "
- + response,
+ "language as the system",
},
{
"role": "user",
"content": "Summarise following conversation in no more than 3 words, "
"respond ONLY with the summary, use the same language as the "
- "system",
+ "system \n\nUser: "
+ + question
+ + "\n\n"
+ + "AI: "
+ + response,
},
]
diff --git a/application/llm/docsgpt_provider.py b/application/llm/docsgpt_provider.py
index bca39729..bb23d824 100644
--- a/application/llm/docsgpt_provider.py
+++ b/application/llm/docsgpt_provider.py
@@ -9,35 +9,25 @@ class DocsGPTAPILLM(BaseLLM):
super().__init__(*args, **kwargs)
self.api_key = api_key
self.user_api_key = user_api_key
- self.endpoint = "https://llm.docsgpt.co.uk"
+ self.endpoint = "https://llm.arc53.com"
def _raw_gen(self, baseself, model, messages, stream=False, *args, **kwargs):
- context = messages[0]["content"]
- user_question = messages[-1]["content"]
- prompt = f"### Instruction \n {user_question} \n ### Context \n {context} \n ### Answer \n"
-
response = requests.post(
- f"{self.endpoint}/answer", json={"prompt": prompt, "max_new_tokens": 30}
+ f"{self.endpoint}/answer", json={"messages": messages, "max_new_tokens": 30}
)
response_clean = response.json()["a"].replace("###", "")
return response_clean
def _raw_gen_stream(self, baseself, model, messages, stream=True, *args, **kwargs):
- context = messages[0]["content"]
- user_question = messages[-1]["content"]
- prompt = f"### Instruction \n {user_question} \n ### Context \n {context} \n ### Answer \n"
-
- # send prompt to endpoint /stream
response = requests.post(
f"{self.endpoint}/stream",
- json={"prompt": prompt, "max_new_tokens": 256},
+ json={"messages": messages, "max_new_tokens": 256},
stream=True,
)
for line in response.iter_lines():
if line:
- # data = json.loads(line)
data_str = line.decode("utf-8")
if data_str.startswith("data: "):
data = json.loads(data_str[6:])
From 312cb9ae7046d82e1d43d4ee835be8b1fc56b9bd Mon Sep 17 00:00:00 2001
From: Alex
Date: Tue, 19 Nov 2024 19:06:53 +0000
Subject: [PATCH 2/3] feat: image parser
---
application/api/user/routes.py | 6 ++++++
application/core/settings.py | 1 +
application/parser/file/bulk.py | 4 ++++
application/parser/file/docs_parser.py | 12 ++++++++++-
application/parser/file/image_parser.py | 28 +++++++++++++++++++++++++
frontend/src/upload/Upload.tsx | 3 +++
6 files changed, 53 insertions(+), 1 deletion(-)
create mode 100644 application/parser/file/image_parser.py
diff --git a/application/api/user/routes.py b/application/api/user/routes.py
index 6a2f3bea..e305845d 100644
--- a/application/api/user/routes.py
+++ b/application/api/user/routes.py
@@ -339,6 +339,9 @@ class UploadFile(Resource):
".json",
".xlsx",
".pptx",
+ ".png",
+ ".jpg",
+ ".jpeg",
],
job_name,
final_filename,
@@ -365,6 +368,9 @@ class UploadFile(Resource):
".json",
".xlsx",
".pptx",
+ ".png",
+ ".jpg",
+ ".jpeg",
],
job_name,
final_filename,
diff --git a/application/core/settings.py b/application/core/settings.py
index d4b02481..a7811ec7 100644
--- a/application/core/settings.py
+++ b/application/core/settings.py
@@ -18,6 +18,7 @@ class Settings(BaseSettings):
DEFAULT_MAX_HISTORY: int = 150
MODEL_TOKEN_LIMITS: dict = {"gpt-3.5-turbo": 4096, "claude-2": 1e5}
UPLOAD_FOLDER: str = "inputs"
+ PARSE_PDF_AS_IMAGE: bool = False
VECTOR_STORE: str = "faiss" # "faiss" or "elasticsearch" or "qdrant" or "milvus" or "lancedb"
RETRIEVERS_ENABLED: list = ["classic_rag", "duckduck_search"] # also brave_search
diff --git a/application/parser/file/bulk.py b/application/parser/file/bulk.py
index 3b8fbca8..8201b3f2 100644
--- a/application/parser/file/bulk.py
+++ b/application/parser/file/bulk.py
@@ -13,6 +13,7 @@ from application.parser.file.rst_parser import RstParser
from application.parser.file.tabular_parser import PandasCSVParser,ExcelParser
from application.parser.file.json_parser import JSONParser
from application.parser.file.pptx_parser import PPTXParser
+from application.parser.file.image_parser import ImageParser
from application.parser.schema.base import Document
DEFAULT_FILE_EXTRACTOR: Dict[str, BaseParser] = {
@@ -27,6 +28,9 @@ DEFAULT_FILE_EXTRACTOR: Dict[str, BaseParser] = {
".mdx": MarkdownParser(),
".json":JSONParser(),
".pptx":PPTXParser(),
+ ".png": ImageParser(),
+ ".jpg": ImageParser(),
+ ".jpeg": ImageParser(),
}
diff --git a/application/parser/file/docs_parser.py b/application/parser/file/docs_parser.py
index 861e8e58..55d45a64 100644
--- a/application/parser/file/docs_parser.py
+++ b/application/parser/file/docs_parser.py
@@ -7,7 +7,8 @@ from pathlib import Path
from typing import Dict
from application.parser.file.base_parser import BaseParser
-
+from application.core.settings import settings
+import requests
class PDFParser(BaseParser):
"""PDF parser."""
@@ -18,6 +19,15 @@ class PDFParser(BaseParser):
def parse_file(self, file: Path, errors: str = "ignore") -> str:
"""Parse file."""
+ if settings.PARSE_PDF_AS_IMAGE:
+ doc2md_service = "https://llm.arc53.com/doc2md"
+ # alternatively you can use local vision capable LLM
+ with open(file, "rb") as file_loaded:
+ files = {'file': file_loaded}
+ response = requests.post(doc2md_service, files=files)
+ data = response.json()["markdown"]
+ return data
+
try:
import PyPDF2
except ImportError:
diff --git a/application/parser/file/image_parser.py b/application/parser/file/image_parser.py
new file mode 100644
index 00000000..508693d8
--- /dev/null
+++ b/application/parser/file/image_parser.py
@@ -0,0 +1,28 @@
+"""Image parser.
+
+Contains parser for .png, .jpg, .jpeg files.
+
+"""
+from pathlib import Path
+import requests
+from typing import Dict, Union
+import traceback
+
+from application.parser.file.base_parser import BaseParser
+
+
+class ImageParser(BaseParser):
+    """Image parser backed by the remote doc2md service (.png, .jpg, .jpeg)."""
+
+    def _init_parser(self) -> Dict:
+        """Init parser."""
+        return {}
+
+    def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, list[str]]:
+        """Upload `file` to doc2md and return the "markdown" field of its JSON reply."""
+        doc2md_service = "https://llm.arc53.com/doc2md"
+        # alternatively you can use local vision capable LLM
+        with open(file, "rb") as file_loaded:
+            response = requests.post(doc2md_service, files={'file': file_loaded}, timeout=300)
+        response.raise_for_status()  # surface HTTP failures instead of a KeyError below
+        return response.json()["markdown"]
diff --git a/frontend/src/upload/Upload.tsx b/frontend/src/upload/Upload.tsx
index 2f28042a..33a77ace 100644
--- a/frontend/src/upload/Upload.tsx
+++ b/frontend/src/upload/Upload.tsx
@@ -332,6 +332,9 @@ function Upload({
],
'application/vnd.openxmlformats-officedocument.presentationml.presentation':
['.pptx'],
+ 'image/png': ['.png'],
+ 'image/jpeg': ['.jpeg'],
+ 'image/jpg': ['.jpg'],
},
});
From f65ecb9a0f116bc926637f1832c911ea2022d551 Mon Sep 17 00:00:00 2001
From: Alex
Date: Tue, 19 Nov 2024 19:16:24 +0000
Subject: [PATCH 3/3] fix: lint import
---
application/parser/file/image_parser.py | 1 -
1 file changed, 1 deletion(-)
diff --git a/application/parser/file/image_parser.py b/application/parser/file/image_parser.py
index 508693d8..fd800d91 100644
--- a/application/parser/file/image_parser.py
+++ b/application/parser/file/image_parser.py
@@ -6,7 +6,6 @@ Contains parser for .png, .jpg, .jpeg files.
from pathlib import Path
import requests
from typing import Dict, Union
-import traceback
from application.parser.file.base_parser import BaseParser