mirror of
https://github.com/arc53/DocsGPT.git
synced 2025-11-29 08:33:20 +00:00
fix: remove old pypdf
This commit is contained in:
@@ -24,26 +24,27 @@ class PDFParser(BaseParser):
|
||||
# alternatively you can use local vision capable LLM
|
||||
with open(file, "rb") as file_loaded:
|
||||
files = {'file': file_loaded}
|
||||
response = requests.post(doc2md_service, files=files)
|
||||
data = response.json()["markdown"]
|
||||
response = requests.post(doc2md_service, files=files)
|
||||
data = response.json()["markdown"]
|
||||
return data
|
||||
|
||||
try:
|
||||
import PyPDF2
|
||||
from pypdf import PdfReader
|
||||
except ImportError:
|
||||
raise ValueError("PyPDF2 is required to read PDF files.")
|
||||
raise ValueError("pypdf is required to read PDF files.")
|
||||
text_list = []
|
||||
with open(file, "rb") as fp:
|
||||
# Create a PDF object
|
||||
pdf = PyPDF2.PdfReader(fp)
|
||||
pdf = PdfReader(fp)
|
||||
|
||||
# Get the number of pages in the PDF document
|
||||
num_pages = len(pdf.pages)
|
||||
|
||||
# Iterate over every page
|
||||
for page in range(num_pages):
|
||||
for page_index in range(num_pages):
|
||||
# Extract the text from the page
|
||||
page_text = pdf.pages[page].extract_text()
|
||||
page = pdf.pages[page_index]
|
||||
page_text = page.extract_text()
|
||||
text_list.append(page_text)
|
||||
text = "\n".join(text_list)
|
||||
|
||||
@@ -66,4 +67,4 @@ class DocxParser(BaseParser):
|
||||
|
||||
text = docx2txt.process(file)
|
||||
|
||||
return text
|
||||
return text
|
||||
Reference in New Issue
Block a user