feat: Presentation parser implementation

Signed-off-by: JeevaRamanathan M <jeevaramanathan.m@infosys.com>
2026-05-21 21:05:05 +00:00 · 2024-10-31 11:47:12 +00:00
parent 45e14bc2f5
commit 5c756348a5
10 changed files with 88 additions and 7 deletions
--- a/application/api/user/routes.py
+++ b/application/api/user/routes.py
@@ -343,6 +343,7 @@ class UploadFile(Resource):
                        ".mdx",
                        ".json",
                        ".xlsx",
+                        ".pptx",
                    ],
                    job_name,
                    final_filename,
--- a/application/parser/file/bulk.py
+++ b/application/parser/file/bulk.py
@@ -12,6 +12,7 @@ from application.parser.file.markdown_parser import MarkdownParser
 from application.parser.file.rst_parser import RstParser
 from application.parser.file.tabular_parser import PandasCSVParser,ExcelParser
 from application.parser.file.json_parser import JSONParser
+from application.parser.file.pptx_parser import PPTXParser
 from application.parser.schema.base import Document

 DEFAULT_FILE_EXTRACTOR: Dict[str, BaseParser] = {
@@ -25,6 +26,7 @@ DEFAULT_FILE_EXTRACTOR: Dict[str, BaseParser] = {
    ".html": HTMLParser(),
    ".mdx": MarkdownParser(),
    ".json":JSONParser(),
+    ".pptx":PPTXParser(),
 }


--- a/application/parser/file/pptx_parser.py
+++ b/application/parser/file/pptx_parser.py
@@ -0,0 +1,75 @@
+"""PPTX parser.
+Contains a parser for presentation (.pptx) files that extracts slide text.
+"""
+from pathlib import Path
+from typing import Any, Dict, List, Union
+
+from application.parser.file.base_parser import BaseParser
+
+class PPTXParser(BaseParser):
+    r"""PPTX (.pptx) parser for extracting text from PowerPoint slides.
+
+    Shapes that expose no ``text`` attribute are skipped.
+
+    Args:
+        concat_slides (bool): If True (default), join all slide texts into a
+            single string; if False, return one string per slide in a list.
+        slide_separator (str): Separator used to join slides' text content.
+            Only used when `concat_slides=True`. Default is "\n".
+
+    Refer to https://python-pptx.readthedocs.io/en/latest/ for more information.
+    """
+
+    def __init__(
+        self,
+        *args: Any,
+        concat_slides: bool = True,
+        slide_separator: str = "\n",
+        **kwargs: Any
+    ) -> None:
+        """Store the slide options; other arguments go to BaseParser."""
+        super().__init__(*args, **kwargs)
+        self._concat_slides = concat_slides
+        self._slide_separator = slide_separator
+
+    def _init_parser(self) -> Dict:
+        """Return an empty config dict."""
+        return {}
+
+    def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]]:
+        r"""Parse a .pptx file and extract text from each slide.
+
+        Args:
+            file (Path): Path to the .pptx file.
+            errors (str): Error handling policy; accepted but currently unused.
+
+        Returns:
+            Union[str, List[str]]: Concatenated text if concat_slides is True,
+            otherwise a list of slide texts.
+
+        Raises:
+            ImportError: If the pptx module cannot be imported.
+        """
+        try:
+            from pptx import Presentation
+        except ImportError as exc:
+            raise ImportError(
+                "pptx module is required to read .PPTX files."
+            ) from exc
+
+        # No try/except here: errors raised while opening or reading the
+        # file propagate to the caller unchanged.
+        presentation = Presentation(file)
+        slide_texts = []
+        for slide in presentation.slides:
+            # Put each shape's text on its own line so adjacent shapes
+            # (e.g. a title and its body) do not run together.
+            shape_texts = [
+                shape.text for shape in slide.shapes if hasattr(shape, "text")
+            ]
+            slide_texts.append("\n".join(shape_texts).strip())
+
+        if self._concat_slides:
+            return self._slide_separator.join(slide_texts)
+        else:
+            return slide_texts
--- a/application/requirements.txt
+++ b/application/requirements.txt
@@ -14,6 +14,7 @@ esutils==1.0.1
 Flask==3.0.3
 faiss-cpu==1.8.0.post1
 flask-restx==1.3.0
+gTTS==2.3.2
 gunicorn==23.0.0
 html2text==2024.2.26
 javalang==0.13.0
@@ -65,6 +66,7 @@ pymongo==4.8.0
 pypdf2==3.0.1
 python-dateutil==2.9.0.post0
 python-dotenv==1.0.1
+python-pptx==1.0.2
 qdrant-client==1.11.0
 redis==5.0.1
 referencing==0.30.2
@@ -84,5 +86,4 @@ urllib3==2.2.3
 vine==5.1.0
 wcwidth==0.2.13
 werkzeug==3.0.4
-yarl==1.11.1
-gTTS==2.3.2
+yarl==1.11.1
--- a/frontend/src/locale/en.json
+++ b/frontend/src/locale/en.json
@@ -86,7 +86,7 @@
      "start": "Start Chatting",
      "name": "Name",
      "choose": "Choose Files",
-      "info": "Please upload .pdf, .txt, .rst, .csv, .xlsx, .docx, .md, .html, .epub, .json, .zip limited to 25mb",
+      "info": "Please upload .pdf, .txt, .rst, .csv, .xlsx, .docx, .md, .html, .epub, .json, .pptx, .zip limited to 25mb",
      "uploadedFiles": "Uploaded Files",
      "cancel": "Cancel",
      "train": "Train",
--- a/frontend/src/locale/es.json
+++ b/frontend/src/locale/es.json
@@ -86,7 +86,7 @@
      "start": "Empezar a chatear",
      "name": "Nombre",
      "choose": "Seleccionar Archivos",
-      "info": "Por favor, suba archivos .pdf, .txt, .rst, .csv, .xlsx, .docx, .md, .html, .epub, .json, .zip limitados a 25 MB",
+      "info": "Por favor, suba archivos .pdf, .txt, .rst, .csv, .xlsx, .docx, .md, .html, .epub, .json, .pptx, .zip limitados a 25 MB",
      "uploadedFiles": "Archivos Subidos",
      "cancel": "Cancelar",
      "train": "Entrenar",
--- a/frontend/src/locale/jp.json
+++ b/frontend/src/locale/jp.json
@@ -86,7 +86,7 @@
      "start": "チャットを開始する",
      "name": "名前",
      "choose": "ファイルを選択",
-      "info": ".pdf, .txt, .rst, .docx, .md, .json, .zipファイルを25MBまでアップロードしてください",
+      "info": ".pdf, .txt, .rst, .csv, .xlsx, .docx, .md, .html, .epub, .json, .pptx, .zipファイルを25MBまでアップロードしてください",
      "uploadedFiles": "アップロードされたファイル",
      "cancel": "キャンセル",
      "train": "トレーニング",
--- a/frontend/src/locale/zh-TW.json
+++ b/frontend/src/locale/zh-TW.json
@@ -80,7 +80,7 @@
      "remote": "遠端",
      "name": "名稱",
      "choose": "選擇檔案",
-      "info": "請上傳 .pdf, .txt, .rst, .docx, .md, .json, .zip 檔案，大小限制為 25MB",
+      "info": "請上傳 .pdf, .txt, .rst, .csv, .xlsx, .docx, .md, .html, .epub, .json, .pptx, .zip 檔案，大小限制為 25MB",
      "uploadedFiles": "已上傳的檔案",
      "cancel": "取消",
      "train": "訓練",
--- a/frontend/src/locale/zh.json
+++ b/frontend/src/locale/zh.json
@@ -86,7 +86,7 @@
      "start": "开始聊天",
      "name": "名称",
      "choose": "选择文件",
-      "info": "请上传 .pdf, .txt, .rst, .csv, .xlsx, .docx, .md, .html, .epub, .json, .zip 文件，限 25MB",
+      "info": "请上传 .pdf, .txt, .rst, .csv, .xlsx, .docx, .md, .html, .epub, .json, .pptx, .zip 文件，限 25MB",
      "uploadedFiles": "已上传文件",
      "cancel": "取消",
      "train": "训练",
--- a/frontend/src/upload/Upload.tsx
+++ b/frontend/src/upload/Upload.tsx
@@ -321,6 +321,8 @@ function Upload({
      'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': [
        '.xlsx',
      ],
+      'application/vnd.openxmlformats-officedocument.presentationml.presentation':
+        ['.pptx'],
    },
  });