feat: image parser

This commit is contained in:
Alex
2024-11-19 19:06:53 +00:00
parent e0a3b8004c
commit 312cb9ae70
6 changed files with 53 additions and 1 deletions

View File

@@ -339,6 +339,9 @@ class UploadFile(Resource):
".json",
".xlsx",
".pptx",
".png",
".jpg",
".jpeg",
],
job_name,
final_filename,
@@ -365,6 +368,9 @@ class UploadFile(Resource):
".json",
".xlsx",
".pptx",
".png",
".jpg",
".jpeg",
],
job_name,
final_filename,

View File

@@ -18,6 +18,7 @@ class Settings(BaseSettings):
DEFAULT_MAX_HISTORY: int = 150
MODEL_TOKEN_LIMITS: dict = {"gpt-3.5-turbo": 4096, "claude-2": 1e5}
UPLOAD_FOLDER: str = "inputs"
PARSE_PDF_AS_IMAGE: bool = False
VECTOR_STORE: str = "faiss" # "faiss" or "elasticsearch" or "qdrant" or "milvus" or "lancedb"
RETRIEVERS_ENABLED: list = ["classic_rag", "duckduck_search"] # also brave_search

View File

@@ -13,6 +13,7 @@ from application.parser.file.rst_parser import RstParser
from application.parser.file.tabular_parser import PandasCSVParser,ExcelParser
from application.parser.file.json_parser import JSONParser
from application.parser.file.pptx_parser import PPTXParser
from application.parser.file.image_parser import ImageParser
from application.parser.schema.base import Document
DEFAULT_FILE_EXTRACTOR: Dict[str, BaseParser] = {
@@ -27,6 +28,9 @@ DEFAULT_FILE_EXTRACTOR: Dict[str, BaseParser] = {
".mdx": MarkdownParser(),
".json":JSONParser(),
".pptx":PPTXParser(),
".png": ImageParser(),
".jpg": ImageParser(),
".jpeg": ImageParser(),
}

View File

@@ -7,7 +7,8 @@ from pathlib import Path
from typing import Dict
from application.parser.file.base_parser import BaseParser
from application.core.settings import settings
import requests
class PDFParser(BaseParser):
"""PDF parser."""
@@ -18,6 +19,15 @@ class PDFParser(BaseParser):
def parse_file(self, file: Path, errors: str = "ignore") -> str:
"""Parse file."""
if settings.PARSE_PDF_AS_IMAGE:
doc2md_service = "https://llm.arc53.com/doc2md"
# alternatively you can use local vision capable LLM
with open(file, "rb") as file_loaded:
files = {'file': file_loaded}
response = requests.post(doc2md_service, files=files)
data = response.json()["markdown"]
return data
try:
import PyPDF2
except ImportError:

View File

@@ -0,0 +1,28 @@
"""Image parser.
Contains parser for .png, .jpg, .jpeg files.
"""
from pathlib import Path
import requests
from typing import Dict, Union
import traceback
from application.parser.file.base_parser import BaseParser
class ImageParser(BaseParser):
"""Image parser."""
def _init_parser(self) -> Dict:
"""Init parser."""
return {}
def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, list[str]]:
doc2md_service = "https://llm.arc53.com/doc2md"
# alternatively you can use local vision capable LLM
with open(file, "rb") as file_loaded:
files = {'file': file_loaded}
response = requests.post(doc2md_service, files=files)
data = response.json()["markdown"]
return data

View File

@@ -332,6 +332,9 @@ function Upload({
],
'application/vnd.openxmlformats-officedocument.presentationml.presentation':
['.pptx'],
'image/png': ['.png'],
'image/jpeg': ['.jpeg'],
'image/jpg': ['.jpg'],
},
});