fix: replace old PyPDF2 with pypdf

This commit is contained in:
Alex
2025-02-06 19:59:42 +00:00
parent d819222cf7
commit d47232246a
2 changed files with 10 additions and 9 deletions

View File

@@ -24,26 +24,27 @@ class PDFParser(BaseParser):
# alternatively you can use local vision capable LLM
with open(file, "rb") as file_loaded:
files = {'file': file_loaded}
response = requests.post(doc2md_service, files=files)
data = response.json()["markdown"]
response = requests.post(doc2md_service, files=files)
data = response.json()["markdown"]
return data
try:
import PyPDF2
from pypdf import PdfReader
except ImportError:
raise ValueError("PyPDF2 is required to read PDF files.")
raise ValueError("pypdf is required to read PDF files.")
text_list = []
with open(file, "rb") as fp:
# Create a PDF object
pdf = PyPDF2.PdfReader(fp)
pdf = PdfReader(fp)
# Get the number of pages in the PDF document
num_pages = len(pdf.pages)
# Iterate over every page
for page in range(num_pages):
for page_index in range(num_pages):
# Extract the text from the page
page_text = pdf.pages[page].extract_text()
page = pdf.pages[page_index]
page_text = page.extract_text()
text_list.append(page_text)
text = "\n".join(text_list)
@@ -66,4 +67,4 @@ class DocxParser(BaseParser):
text = docx2txt.process(file)
return text
return text