feat: image parser

This commit is contained in:
Alex
2024-11-19 19:06:53 +00:00
parent e0a3b8004c
commit 312cb9ae70
6 changed files with 53 additions and 1 deletions

View File

@@ -13,6 +13,7 @@ from application.parser.file.rst_parser import RstParser
from application.parser.file.tabular_parser import PandasCSVParser,ExcelParser
from application.parser.file.json_parser import JSONParser
from application.parser.file.pptx_parser import PPTXParser
from application.parser.file.image_parser import ImageParser
from application.parser.schema.base import Document
DEFAULT_FILE_EXTRACTOR: Dict[str, BaseParser] = {
@@ -27,6 +28,9 @@ DEFAULT_FILE_EXTRACTOR: Dict[str, BaseParser] = {
".mdx": MarkdownParser(),
".json":JSONParser(),
".pptx":PPTXParser(),
".png": ImageParser(),
".jpg": ImageParser(),
".jpeg": ImageParser(),
}

View File

@@ -7,7 +7,8 @@ from pathlib import Path
from typing import Dict
from application.parser.file.base_parser import BaseParser
from application.core.settings import settings
import requests
class PDFParser(BaseParser):
"""PDF parser."""
@@ -18,6 +19,15 @@ class PDFParser(BaseParser):
def parse_file(self, file: Path, errors: str = "ignore") -> str:
"""Parse file."""
if settings.PARSE_PDF_AS_IMAGE:
doc2md_service = "https://llm.arc53.com/doc2md"
# alternatively you can use local vision capable LLM
with open(file, "rb") as file_loaded:
files = {'file': file_loaded}
response = requests.post(doc2md_service, files=files)
data = response.json()["markdown"]
return data
try:
import PyPDF2
except ImportError:

View File

@@ -0,0 +1,28 @@
"""Image parser.
Contains parser for .png, .jpg, .jpeg files.
"""
from pathlib import Path
import requests
from typing import Dict, Union
import traceback
from application.parser.file.base_parser import BaseParser
class ImageParser(BaseParser):
"""Image parser."""
def _init_parser(self) -> Dict:
"""Init parser."""
return {}
def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, list[str]]:
doc2md_service = "https://llm.arc53.com/doc2md"
# alternatively you can use local vision capable LLM
with open(file, "rb") as file_loaded:
files = {'file': file_loaded}
response = requests.post(doc2md_service, files=files)
data = response.json()["markdown"]
return data