# DocsGPT/application/parser/file/pptx_parser.py

"""PPT parser.
Contains parsers for presentation (.pptx) files to extract slide text.
"""
from pathlib import Path
from typing import Any, Dict, List, Union

from application.parser.file.base_parser import BaseParser

class PPTXParser(BaseParser):
    r"""PPTX (.pptx) parser for extracting text from PowerPoint slides.

    Args:
        concat_slides (bool): Specifies whether to concatenate all slide text into one document.
            - If True, slide texts will be joined together as a single string.
            - If False, each slide's text will be stored as a separate entry in a list.
            Set to True by default.
        slide_separator (str): Separator used to join slides' text content.
            Only used when `concat_slides=True`. Default is "\n".

    Refer to https://python-pptx.readthedocs.io/en/latest/ for more information.
    """

    # Delimiter placed between the texts of individual shapes on one slide.
    # Without it, the last word of one shape fuses with the first word of the
    # next (e.g. "Title" + "Body" -> "TitleBody").
    _SHAPE_SEPARATOR = "\n"

    def __init__(
        self,
        *args: Any,
        concat_slides: bool = True,
        slide_separator: str = "\n",
        **kwargs: Any
    ) -> None:
        """Store the slide-joining options; other arguments go to the base class."""
        super().__init__(*args, **kwargs)
        self._concat_slides = concat_slides
        self._slide_separator = slide_separator

    def _init_parser(self) -> Dict:
        """Return an empty config dict; this parser keeps no parser-level state."""
        return {}

    def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]]:
        r"""Parse a .pptx file and extract text from each slide.

        Args:
            file (Path): Path to the .pptx file.
            errors (str): Error handling policy ('ignore' by default). Accepted
                for signature compatibility; this parser does not read it.

        Returns:
            Union[str, List[str]]: Slide texts joined with ``slide_separator``
            if ``concat_slides`` is True, otherwise a list with one stripped
            string per slide (empty string for slides without text).

        Raises:
            ImportError: If the ``pptx`` (python-pptx) package is not installed.
            Exception: Any error raised by python-pptx while opening or reading
                the file propagates unchanged.
        """
        # Imported lazily so the dependency is only required when a .pptx file
        # is actually parsed.
        try:
            from pptx import Presentation
        except ImportError as exc:
            raise ImportError("pptx module is required to read .PPTX files.") from exc

        presentation = Presentation(file)
        slide_texts = [
            self._extract_slide_text(slide) for slide in presentation.slides
        ]

        if self._concat_slides:
            return self._slide_separator.join(slide_texts)
        return slide_texts

    @classmethod
    def _extract_slide_text(cls, slide: Any) -> str:
        """Collect the text of every text-bearing shape on one slide."""
        shape_texts = []
        for shape in slide.shapes:
            # Only shapes that expose a `text` attribute contribute; anything
            # else is skipped.
            text = getattr(shape, "text", None)
            # Drop empty/whitespace-only entries so they do not produce blank
            # lines in the joined output.
            if text and text.strip():
                shape_texts.append(text)
        return cls._SHAPE_SEPARATOR.join(shape_texts).strip()