mirror of
https://github.com/arc53/DocsGPT.git
synced 2025-11-30 00:53:14 +00:00
uploads backend first
This commit is contained in:
59
application/parser/file/docs_parser.py
Normal file
59
application/parser/file/docs_parser.py
Normal file
@@ -0,0 +1,59 @@
|
||||
"""Docs parser.
|
||||
|
||||
Contains parsers for docx, pdf files.
|
||||
|
||||
"""
|
||||
from pathlib import Path
|
||||
from typing import Dict
|
||||
|
||||
from parser.file.base_parser import BaseParser
|
||||
|
||||
|
||||
class PDFParser(BaseParser):
|
||||
"""PDF parser."""
|
||||
|
||||
def _init_parser(self) -> Dict:
|
||||
"""Init parser."""
|
||||
return {}
|
||||
|
||||
def parse_file(self, file: Path, errors: str = "ignore") -> str:
|
||||
"""Parse file."""
|
||||
try:
|
||||
import PyPDF2
|
||||
except ImportError:
|
||||
raise ValueError("PyPDF2 is required to read PDF files.")
|
||||
text_list = []
|
||||
with open(file, "rb") as fp:
|
||||
# Create a PDF object
|
||||
pdf = PyPDF2.PdfReader(fp)
|
||||
|
||||
# Get the number of pages in the PDF document
|
||||
num_pages = len(pdf.pages)
|
||||
|
||||
# Iterate over every page
|
||||
for page in range(num_pages):
|
||||
# Extract the text from the page
|
||||
page_text = pdf.pages[page].extract_text()
|
||||
text_list.append(page_text)
|
||||
text = "\n".join(text_list)
|
||||
|
||||
return text
|
||||
|
||||
|
||||
class DocxParser(BaseParser):
|
||||
"""Docx parser."""
|
||||
|
||||
def _init_parser(self) -> Dict:
|
||||
"""Init parser."""
|
||||
return {}
|
||||
|
||||
def parse_file(self, file: Path, errors: str = "ignore") -> str:
|
||||
"""Parse file."""
|
||||
try:
|
||||
import docx2txt
|
||||
except ImportError:
|
||||
raise ValueError("docx2txt is required to read Microsoft Word files.")
|
||||
|
||||
text = docx2txt.process(file)
|
||||
|
||||
return text
|
||||
Reference in New Issue
Block a user