mirror of
https://github.com/arc53/DocsGPT.git
synced 2025-11-29 08:33:20 +00:00
feat: image parser
This commit is contained in:
@@ -339,6 +339,9 @@ class UploadFile(Resource):
|
|||||||
".json",
|
".json",
|
||||||
".xlsx",
|
".xlsx",
|
||||||
".pptx",
|
".pptx",
|
||||||
|
".png",
|
||||||
|
".jpg",
|
||||||
|
".jpeg",
|
||||||
],
|
],
|
||||||
job_name,
|
job_name,
|
||||||
final_filename,
|
final_filename,
|
||||||
@@ -365,6 +368,9 @@ class UploadFile(Resource):
|
|||||||
".json",
|
".json",
|
||||||
".xlsx",
|
".xlsx",
|
||||||
".pptx",
|
".pptx",
|
||||||
|
".png",
|
||||||
|
".jpg",
|
||||||
|
".jpeg",
|
||||||
],
|
],
|
||||||
job_name,
|
job_name,
|
||||||
final_filename,
|
final_filename,
|
||||||
|
|||||||
@@ -18,6 +18,7 @@ class Settings(BaseSettings):
|
|||||||
DEFAULT_MAX_HISTORY: int = 150
|
DEFAULT_MAX_HISTORY: int = 150
|
||||||
MODEL_TOKEN_LIMITS: dict = {"gpt-3.5-turbo": 4096, "claude-2": 1e5}
|
MODEL_TOKEN_LIMITS: dict = {"gpt-3.5-turbo": 4096, "claude-2": 1e5}
|
||||||
UPLOAD_FOLDER: str = "inputs"
|
UPLOAD_FOLDER: str = "inputs"
|
||||||
|
PARSE_PDF_AS_IMAGE: bool = False
|
||||||
VECTOR_STORE: str = "faiss" # "faiss" or "elasticsearch" or "qdrant" or "milvus" or "lancedb"
|
VECTOR_STORE: str = "faiss" # "faiss" or "elasticsearch" or "qdrant" or "milvus" or "lancedb"
|
||||||
RETRIEVERS_ENABLED: list = ["classic_rag", "duckduck_search"] # also brave_search
|
RETRIEVERS_ENABLED: list = ["classic_rag", "duckduck_search"] # also brave_search
|
||||||
|
|
||||||
|
|||||||
@@ -13,6 +13,7 @@ from application.parser.file.rst_parser import RstParser
|
|||||||
from application.parser.file.tabular_parser import PandasCSVParser,ExcelParser
|
from application.parser.file.tabular_parser import PandasCSVParser,ExcelParser
|
||||||
from application.parser.file.json_parser import JSONParser
|
from application.parser.file.json_parser import JSONParser
|
||||||
from application.parser.file.pptx_parser import PPTXParser
|
from application.parser.file.pptx_parser import PPTXParser
|
||||||
|
from application.parser.file.image_parser import ImageParser
|
||||||
from application.parser.schema.base import Document
|
from application.parser.schema.base import Document
|
||||||
|
|
||||||
DEFAULT_FILE_EXTRACTOR: Dict[str, BaseParser] = {
|
DEFAULT_FILE_EXTRACTOR: Dict[str, BaseParser] = {
|
||||||
@@ -27,6 +28,9 @@ DEFAULT_FILE_EXTRACTOR: Dict[str, BaseParser] = {
|
|||||||
".mdx": MarkdownParser(),
|
".mdx": MarkdownParser(),
|
||||||
".json":JSONParser(),
|
".json":JSONParser(),
|
||||||
".pptx":PPTXParser(),
|
".pptx":PPTXParser(),
|
||||||
|
".png": ImageParser(),
|
||||||
|
".jpg": ImageParser(),
|
||||||
|
".jpeg": ImageParser(),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -7,7 +7,8 @@ from pathlib import Path
|
|||||||
from typing import Dict
|
from typing import Dict
|
||||||
|
|
||||||
from application.parser.file.base_parser import BaseParser
|
from application.parser.file.base_parser import BaseParser
|
||||||
|
from application.core.settings import settings
|
||||||
|
import requests
|
||||||
|
|
||||||
class PDFParser(BaseParser):
|
class PDFParser(BaseParser):
|
||||||
"""PDF parser."""
|
"""PDF parser."""
|
||||||
@@ -18,6 +19,15 @@ class PDFParser(BaseParser):
|
|||||||
|
|
||||||
def parse_file(self, file: Path, errors: str = "ignore") -> str:
|
def parse_file(self, file: Path, errors: str = "ignore") -> str:
|
||||||
"""Parse file."""
|
"""Parse file."""
|
||||||
|
if settings.PARSE_PDF_AS_IMAGE:
|
||||||
|
doc2md_service = "https://llm.arc53.com/doc2md"
|
||||||
|
# alternatively you can use local vision capable LLM
|
||||||
|
with open(file, "rb") as file_loaded:
|
||||||
|
files = {'file': file_loaded}
|
||||||
|
response = requests.post(doc2md_service, files=files)
|
||||||
|
data = response.json()["markdown"]
|
||||||
|
return data
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import PyPDF2
|
import PyPDF2
|
||||||
except ImportError:
|
except ImportError:
|
||||||
|
|||||||
28
application/parser/file/image_parser.py
Normal file
28
application/parser/file/image_parser.py
Normal file
@@ -0,0 +1,28 @@
|
|||||||
|
"""Image parser.
|
||||||
|
|
||||||
|
Contains parser for .png, .jpg, .jpeg files.
|
||||||
|
|
||||||
|
"""
|
||||||
|
from pathlib import Path
|
||||||
|
import requests
|
||||||
|
from typing import Dict, Union
|
||||||
|
import traceback
|
||||||
|
|
||||||
|
from application.parser.file.base_parser import BaseParser
|
||||||
|
|
||||||
|
|
||||||
|
class ImageParser(BaseParser):
    """Image parser.

    Converts .png, .jpg and .jpeg files to markdown by uploading them to a
    remote doc2md HTTP service and returning the markdown it produces.
    """

    # Remote service that turns an uploaded document/image into markdown.
    # Alternatively you can use a local vision capable LLM.
    DOC2MD_SERVICE = "https://llm.arc53.com/doc2md"

    # Upper bound (seconds) on waiting for the service. Without a timeout,
    # requests blocks forever if the remote end stalls.
    REQUEST_TIMEOUT = 120

    def _init_parser(self) -> Dict:
        """Init parser."""
        return {}

    def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, list[str]]:
        """Parse an image file into markdown via the doc2md service.

        Args:
            file: Path of the image to upload.
            errors: Unused here; accepted so the signature matches the
                other parsers' ``parse_file``.

        Returns:
            The value of the ``"markdown"`` key in the service's JSON reply.

        Raises:
            requests.RequestException: On connection failure, timeout, or a
                non-2xx HTTP status.
            KeyError: If the JSON reply has no ``"markdown"`` key.
        """
        with open(file, "rb") as file_loaded:
            response = requests.post(
                self.DOC2MD_SERVICE,
                files={"file": file_loaded},
                timeout=self.REQUEST_TIMEOUT,
            )
        # Fail with the real HTTP error instead of an opaque JSON/KeyError
        # when the service answers with an error page.
        response.raise_for_status()
        return response.json()["markdown"]
|
||||||
@@ -332,6 +332,9 @@ function Upload({
|
|||||||
],
|
],
|
||||||
'application/vnd.openxmlformats-officedocument.presentationml.presentation':
|
'application/vnd.openxmlformats-officedocument.presentationml.presentation':
|
||||||
['.pptx'],
|
['.pptx'],
|
||||||
|
'image/png': ['.png'],
|
||||||
|
'image/jpeg': ['.jpeg'],
|
||||||
|
'image/jpg': ['.jpg'],
|
||||||
},
|
},
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user