Files
DocsGPT/application/parser/file/pptx_parser.py
JeevaRamanathan M 5c756348a5 feat: Presentation parser implementation
Signed-off-by: JeevaRamanathan M <jeevaramanathan.m@infosys.com>
2024-10-31 11:47:12 +00:00

75 lines
2.6 KiB
Python

"""PPT parser.
Contains parsers for presentation (.pptx) files to extract slide text.
"""
from pathlib import Path
from typing import Any, Dict, List, Union
from application.parser.file.base_parser import BaseParser
class PPTXParser(BaseParser):
r"""PPTX (.pptx) parser for extracting text from PowerPoint slides.
Args:
concat_slides (bool): Specifies whether to concatenate all slide text into one document.
- If True, slide texts will be joined together as a single string.
- If False, each slide's text will be stored as a separate entry in a list.
Set to True by default.
slide_separator (str): Separator used to join slides' text content.
Only used when `concat_slides=True`. Default is "\n".
Refer to https://python-pptx.readthedocs.io/en/latest/ for more information.
"""
def __init__(
self,
*args: Any,
concat_slides: bool = True,
slide_separator: str = "\n",
**kwargs: Any
) -> None:
"""Init params."""
super().__init__(*args, **kwargs)
self._concat_slides = concat_slides
self._slide_separator = slide_separator
def _init_parser(self) -> Dict:
"""Init parser."""
return {}
def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]]:
r"""
Parse a .pptx file and extract text from each slide.
Args:
file (Path): Path to the .pptx file.
errors (str): Error handling policy ('ignore' by default).
Returns:
Union[str, List[str]]: Concatenated text if concat_slides is True,
otherwise a list of slide texts.
"""
try:
from pptx import Presentation
except ImportError:
raise ImportError("pptx module is required to read .PPTX files.")
try:
presentation = Presentation(file)
slide_texts=[]
# Iterate over each slide in the presentation
for slide in presentation.slides:
slide_text=""
# Iterate over each shape in the slide
for shape in slide.shapes:
# Check if the shape has a 'text' attribute and append that to the slide_text
if hasattr(shape,"text"):
slide_text+=shape.text
slide_texts.append(slide_text.strip())
if self._concat_slides:
return self._slide_separator.join(slide_texts)
else:
return slide_texts
except Exception as e:
raise e