From cce60ce10143d0e7c7d9ce7724a98406fe384b17 Mon Sep 17 00:00:00 2001
From: Alex <a@tushynski.me>
Date: Tue, 19 Nov 2024 16:22:58 +0000
Subject: [PATCH 1/3] fix: save convo messages, docsgpt provider format

---
 application/api/answer/routes.py    | 12 ++++++------
 application/llm/docsgpt_provider.py | 16 +++-------------
 2 files changed, 9 insertions(+), 19 deletions(-)

diff --git a/application/api/answer/routes.py b/application/api/answer/routes.py
index f109db26..f9ab19be 100644
--- a/application/api/answer/routes.py
+++ b/application/api/answer/routes.py
@@ -141,17 +141,17 @@ def save_conversation(conversation_id, question, response, source_log_docs, llm)
                 "role": "assistant",
                 "content": "Summarise following conversation in no more than 3 "
                 "words, respond ONLY with the summary, use the same "
-                "language as the system \n\nUser: "
-                + question
-                + "\n\n"
-                + "AI: "
-                + response,
+                "language as the system",
             },
             {
                 "role": "user",
                 "content": "Summarise following conversation in no more than 3 words, "
                 "respond ONLY with the summary, use the same language as the "
-                "system",
+                "system \n\nUser: "
+                + question
+                + "\n\n"
+                + "AI: "
+                + response,
             },
         ]
 
diff --git a/application/llm/docsgpt_provider.py b/application/llm/docsgpt_provider.py
index bca39729..bb23d824 100644
--- a/application/llm/docsgpt_provider.py
+++ b/application/llm/docsgpt_provider.py
@@ -9,35 +9,25 @@ class DocsGPTAPILLM(BaseLLM):
         super().__init__(*args, **kwargs)
         self.api_key = api_key
         self.user_api_key = user_api_key
-        self.endpoint = "https://llm.docsgpt.co.uk"
+        self.endpoint = "https://llm.arc53.com"
 
     def _raw_gen(self, baseself, model, messages, stream=False, *args, **kwargs):
-        context = messages[0]["content"]
-        user_question = messages[-1]["content"]
-        prompt = f"### Instruction \n {user_question} \n ### Context \n {context} \n ### Answer \n"
-
         response = requests.post(
-            f"{self.endpoint}/answer", json={"prompt": prompt, "max_new_tokens": 30}
+            f"{self.endpoint}/answer", json={"messages": messages, "max_new_tokens": 30}
         )
         response_clean = response.json()["a"].replace("###", "")
 
         return response_clean
 
     def _raw_gen_stream(self, baseself, model, messages, stream=True, *args, **kwargs):
-        context = messages[0]["content"]
-        user_question = messages[-1]["content"]
-        prompt = f"### Instruction \n {user_question} \n ### Context \n {context} \n ### Answer \n"
-
-        # send prompt to endpoint /stream
         response = requests.post(
             f"{self.endpoint}/stream",
-            json={"prompt": prompt, "max_new_tokens": 256},
+            json={"messages": messages, "max_new_tokens": 256},
             stream=True,
         )
 
         for line in response.iter_lines():
             if line:
-                # data = json.loads(line)
                 data_str = line.decode("utf-8")
                 if data_str.startswith("data: "):
                     data = json.loads(data_str[6:])

From 312cb9ae7046d82e1d43d4ee835be8b1fc56b9bd Mon Sep 17 00:00:00 2001
From: Alex <a@tushynski.me>
Date: Tue, 19 Nov 2024 19:06:53 +0000
Subject: [PATCH 2/3] feat: image parser

---
 application/api/user/routes.py          |  6 ++++++
 application/core/settings.py            |  1 +
 application/parser/file/bulk.py         |  4 ++++
 application/parser/file/docs_parser.py  | 12 ++++++++++-
 application/parser/file/image_parser.py | 28 +++++++++++++++++++++++++
 frontend/src/upload/Upload.tsx          |  3 +++
 6 files changed, 53 insertions(+), 1 deletion(-)
 create mode 100644 application/parser/file/image_parser.py

diff --git a/application/api/user/routes.py b/application/api/user/routes.py
index 6a2f3bea..e305845d 100644
--- a/application/api/user/routes.py
+++ b/application/api/user/routes.py
@@ -339,6 +339,9 @@ class UploadFile(Resource):
                         ".json",
                         ".xlsx",
                         ".pptx",
+                        ".png",
+                        ".jpg",
+                        ".jpeg",
                     ],
                     job_name,
                     final_filename,
@@ -365,6 +368,9 @@ class UploadFile(Resource):
                         ".json",
                         ".xlsx",
                         ".pptx",
+                        ".png",
+                        ".jpg",
+                        ".jpeg",
                     ],
                     job_name,
                     final_filename,
diff --git a/application/core/settings.py b/application/core/settings.py
index d4b02481..a7811ec7 100644
--- a/application/core/settings.py
+++ b/application/core/settings.py
@@ -18,6 +18,7 @@ class Settings(BaseSettings):
     DEFAULT_MAX_HISTORY: int = 150
     MODEL_TOKEN_LIMITS: dict = {"gpt-3.5-turbo": 4096, "claude-2": 1e5}
     UPLOAD_FOLDER: str = "inputs"
+    PARSE_PDF_AS_IMAGE: bool = False
     VECTOR_STORE: str = "faiss" #  "faiss" or "elasticsearch" or "qdrant" or "milvus" or "lancedb"
     RETRIEVERS_ENABLED: list = ["classic_rag", "duckduck_search"] # also brave_search
 
diff --git a/application/parser/file/bulk.py b/application/parser/file/bulk.py
index 3b8fbca8..8201b3f2 100644
--- a/application/parser/file/bulk.py
+++ b/application/parser/file/bulk.py
@@ -13,6 +13,7 @@ from application.parser.file.rst_parser import RstParser
 from application.parser.file.tabular_parser import PandasCSVParser,ExcelParser
 from application.parser.file.json_parser import JSONParser
 from application.parser.file.pptx_parser import PPTXParser
+from application.parser.file.image_parser import ImageParser
 from application.parser.schema.base import Document
 
 DEFAULT_FILE_EXTRACTOR: Dict[str, BaseParser] = {
@@ -27,6 +28,9 @@ DEFAULT_FILE_EXTRACTOR: Dict[str, BaseParser] = {
     ".mdx": MarkdownParser(),
     ".json":JSONParser(),
     ".pptx":PPTXParser(),
+    ".png": ImageParser(),
+    ".jpg": ImageParser(),
+    ".jpeg": ImageParser(),
 }
 
 
diff --git a/application/parser/file/docs_parser.py b/application/parser/file/docs_parser.py
index 861e8e58..55d45a64 100644
--- a/application/parser/file/docs_parser.py
+++ b/application/parser/file/docs_parser.py
@@ -7,7 +7,8 @@ from pathlib import Path
 from typing import Dict
 
 from application.parser.file.base_parser import BaseParser
-
+from application.core.settings import settings
+import requests
 
 class PDFParser(BaseParser):
     """PDF parser."""
@@ -18,6 +19,15 @@ class PDFParser(BaseParser):
 
     def parse_file(self, file: Path, errors: str = "ignore") -> str:
         """Parse file."""
+        if settings.PARSE_PDF_AS_IMAGE:
+            # Remote doc2md conversion; a local vision capable LLM can be used instead.
+            doc2md_service = "https://llm.arc53.com/doc2md"
+            with open(file, "rb") as file_loaded:
+                files = {"file": file_loaded}
+                response = requests.post(doc2md_service, files=files, timeout=120)
+            response.raise_for_status()  # surface HTTP errors instead of a KeyError
+            return response.json()["markdown"]
+
         try:
             import PyPDF2
         except ImportError:
diff --git a/application/parser/file/image_parser.py b/application/parser/file/image_parser.py
new file mode 100644
index 00000000..508693d8
--- /dev/null
+++ b/application/parser/file/image_parser.py
@@ -0,0 +1,28 @@
+"""Image parser.
+
+Contains parser for .png, .jpg, .jpeg files.
+
+"""
+from pathlib import Path
+import requests
+from typing import Dict, Union
+import traceback
+
+from application.parser.file.base_parser import BaseParser
+
+
+class ImageParser(BaseParser):
+    """Parser for .png/.jpg/.jpeg files that fetches markdown from doc2md."""
+
+    def _init_parser(self) -> Dict:
+        """Return an empty config dict; nothing is initialised locally."""
+        return {}
+
+    def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, list[str]]:
+        """Upload ``file`` to doc2md and return the "markdown" field of its reply."""
+        doc2md_service = "https://llm.arc53.com/doc2md"
+        # alternatively you can use local vision capable LLM
+        with open(file, "rb") as file_loaded:
+            response = requests.post(doc2md_service, files={"file": file_loaded}, timeout=120)
+        response.raise_for_status()  # surface HTTP errors instead of a KeyError below
+        return response.json()["markdown"]
diff --git a/frontend/src/upload/Upload.tsx b/frontend/src/upload/Upload.tsx
index 2f28042a..33a77ace 100644
--- a/frontend/src/upload/Upload.tsx
+++ b/frontend/src/upload/Upload.tsx
@@ -332,6 +332,9 @@ function Upload({
       ],
       'application/vnd.openxmlformats-officedocument.presentationml.presentation':
         ['.pptx'],
+      'image/png': ['.png'],
+      'image/jpeg': ['.jpeg'],
+      'image/jpg': ['.jpg'],
     },
   });
 

From f65ecb9a0f116bc926637f1832c911ea2022d551 Mon Sep 17 00:00:00 2001
From: Alex <a@tushynski.me>
Date: Tue, 19 Nov 2024 19:16:24 +0000
Subject: [PATCH 3/3] fix: lint import

---
 application/parser/file/image_parser.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/application/parser/file/image_parser.py b/application/parser/file/image_parser.py
index 508693d8..fd800d91 100644
--- a/application/parser/file/image_parser.py
+++ b/application/parser/file/image_parser.py
@@ -6,7 +6,6 @@ Contains parser for .png, .jpg, .jpeg files.
 from pathlib import Path
 import requests
 from typing import Dict, Union
-import traceback
 
 from application.parser.file.base_parser import BaseParser