From d47232246a9ac49afa8414d4a55519fb85874be6 Mon Sep 17 00:00:00 2001 From: Alex Date: Thu, 6 Feb 2025 19:59:42 +0000 Subject: [PATCH] fix: remove old pypdf --- application/parser/file/docs_parser.py | 17 +++++++++-------- application/requirements.txt | 2 +- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/application/parser/file/docs_parser.py b/application/parser/file/docs_parser.py index 55d45a64..a1295290 100644 --- a/application/parser/file/docs_parser.py +++ b/application/parser/file/docs_parser.py @@ -24,26 +24,27 @@ class PDFParser(BaseParser): # alternatively you can use local vision capable LLM with open(file, "rb") as file_loaded: files = {'file': file_loaded} - response = requests.post(doc2md_service, files=files) - data = response.json()["markdown"] + response = requests.post(doc2md_service, files=files) + data = response.json()["markdown"] return data try: - import PyPDF2 + from pypdf import PdfReader except ImportError: - raise ValueError("PyPDF2 is required to read PDF files.") + raise ValueError("pypdf is required to read PDF files.") text_list = [] with open(file, "rb") as fp: # Create a PDF object - pdf = PyPDF2.PdfReader(fp) + pdf = PdfReader(fp) # Get the number of pages in the PDF document num_pages = len(pdf.pages) # Iterate over every page - for page in range(num_pages): + for page_index in range(num_pages): # Extract the text from the page - page_text = pdf.pages[page].extract_text() + page = pdf.pages[page_index] + page_text = page.extract_text() text_list.append(page_text) text = "\n".join(text_list) @@ -66,4 +67,4 @@ class DocxParser(BaseParser): text = docx2txt.process(file) - return text + return text \ No newline at end of file diff --git a/application/requirements.txt b/application/requirements.txt index 12ea4ee5..5732809b 100644 --- a/application/requirements.txt +++ b/application/requirements.txt @@ -66,7 +66,7 @@ pydantic==2.10.4 pydantic-core==2.27.2 pydantic-settings==2.7.1 pymongo==4.10.1 -pypdf2==3.0.1 +pypdf==5.2.0 python-dateutil==2.9.0.post0 python-dotenv==1.0.1 python-pptx==1.0.2