uploads backend first

2026-02-12 01:01:37 +00:00 · 2023-03-13 14:20:03 +00:00
parent a44cde33ed
commit 1d2162705d
25 changed files with 1600 additions and 18 deletions
--- a/application/parser/file/docs_parser.py
+++ b/application/parser/file/docs_parser.py
@@ -0,0 +1,59 @@
+"""Docs parser.
+
+Contains parsers for PDF and DOCX files.
+
+"""
+from pathlib import Path
+from typing import Dict
+
+from parser.file.base_parser import BaseParser
+
+
+class PDFParser(BaseParser):
+    """Parser that extracts plain text from PDF files using PyPDF2."""
+
+    def _init_parser(self) -> Dict:
+        """Return the parser configuration.
+
+        Nothing is prepared up front, so the result is an empty dict.
+        """
+        return {}
+
+    def parse_file(self, file: Path, errors: str = "ignore") -> str:
+        """Extract the text of every page in a PDF.
+
+        Args:
+            file: Path of the PDF to read.
+            errors: Not used by this parser; presumably kept so the
+                signature lines up with the other parsers (confirm
+                against BaseParser).
+
+        Returns:
+            The text of all pages, in page order, joined with newlines.
+
+        Raises:
+            ValueError: If PyPDF2 is not installed.
+        """
+        # Imported lazily so PyPDF2 is only needed when a PDF is parsed.
+        try:
+            import PyPDF2
+        except ImportError as err:
+            # Keep ValueError for existing callers, but chain the real cause.
+            raise ValueError("PyPDF2 is required to read PDF files.") from err
+
+        with open(file, "rb") as fp:
+            pdf = PyPDF2.PdfReader(fp)
+            # Extraction stays inside the ``with`` block so the file is
+            # still open while each page is read.
+            page_texts = [page.extract_text() for page in pdf.pages]
+
+        return "\n".join(page_texts)
+
+
+class DocxParser(BaseParser):
+    """Parser that extracts plain text from Microsoft Word files using docx2txt."""
+
+    def _init_parser(self) -> Dict:
+        """Return the parser configuration.
+
+        Nothing is prepared up front, so the result is an empty dict.
+        """
+        return {}
+
+    def parse_file(self, file: Path, errors: str = "ignore") -> str:
+        """Extract the text of a Word document.
+
+        Args:
+            file: Path of the document to read.
+            errors: Not used by this parser; presumably kept so the
+                signature lines up with the other parsers (confirm
+                against BaseParser).
+
+        Returns:
+            Whatever ``docx2txt.process`` returns for ``file``.
+
+        Raises:
+            ValueError: If docx2txt is not installed.
+        """
+        # Imported lazily so docx2txt is only needed when a document is parsed.
+        try:
+            import docx2txt
+        except ImportError as err:
+            # Keep ValueError for existing callers, but chain the real cause.
+            raise ValueError(
+                "docx2txt is required to read Microsoft Word files."
+            ) from err
+
+        return docx2txt.process(file)