mirror of
https://github.com/arc53/DocsGPT.git
synced 2025-11-29 16:43:16 +00:00
75 lines
2.6 KiB
Python
75 lines
2.6 KiB
Python
"""PPT parser.
|
|
Contains parsers for presentation (.pptx) files to extract slide text.
|
|
"""
|
|
from pathlib import Path
|
|
from typing import Any, Dict, List, Union
|
|
|
|
from application.parser.file.base_parser import BaseParser
|
|
|
|
class PPTXParser(BaseParser):
    r"""PPTX (.pptx) parser for extracting text from PowerPoint slides.

    Args:
        concat_slides (bool): Specifies whether to concatenate all slide text
            into one document.
            - If True, slide texts will be joined together as a single string.
            - If False, each slide's text will be stored as a separate entry
              in a list.
            Set to True by default.
        slide_separator (str): Separator used to join slides' text content.
            Only used when `concat_slides=True`. Default is "\n".

    Refer to https://python-pptx.readthedocs.io/en/latest/ for more information.
    """

    def __init__(
        self,
        *args: Any,
        concat_slides: bool = True,
        slide_separator: str = "\n",
        **kwargs: Any
    ) -> None:
        """Store the slide-joining options and forward the rest to BaseParser."""
        super().__init__(*args, **kwargs)
        self._concat_slides = concat_slides
        self._slide_separator = slide_separator

    def _init_parser(self) -> Dict:
        """Return the parser configuration; this parser needs none."""
        return {}

    def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]]:
        r"""Parse a .pptx file and extract the text of each slide.

        The text of every shape on a slide that exposes a ``text`` attribute
        is collected; shapes on the same slide are separated by a newline so
        that adjacent text boxes do not run into each other.

        Args:
            file (Path): Path to the .pptx file.
            errors (str): Error handling policy ('ignore' by default).
                Not referenced by this method; presumably kept for signature
                compatibility with the other parsers (confirm against
                BaseParser).

        Returns:
            Union[str, List[str]]: Slide texts joined with ``slide_separator``
                if ``concat_slides`` is True, otherwise a list with one
                stripped string per slide.

        Raises:
            ImportError: If the ``pptx`` module cannot be imported.
        """
        try:
            from pptx import Presentation
        except ImportError as exc:
            # Chain the original failure so the root cause stays visible.
            raise ImportError(
                "pptx module is required to read .PPTX files."
            ) from exc

        presentation = Presentation(file)
        slide_texts = []

        for slide in presentation.slides:
            shape_texts = []
            for shape in slide.shapes:
                # Read the attribute once; shapes without text (or with empty
                # text) contribute nothing to the slide.
                text = getattr(shape, "text", None)
                if text:
                    shape_texts.append(text)

            # Newline between shapes keeps words from neighbouring text boxes
            # from being fused together.
            slide_texts.append("\n".join(shape_texts).strip())

        if self._concat_slides:
            return self._slide_separator.join(slide_texts)
        return slide_texts