mirror of
https://github.com/arc53/DocsGPT.git
synced 2025-11-29 08:33:20 +00:00
feat: Presentation parser implementation
Signed-off-by: JeevaRamanathan M <jeevaramanathan.m@infosys.com>
This commit is contained in:
@@ -343,6 +343,7 @@ class UploadFile(Resource):
|
||||
".mdx",
|
||||
".json",
|
||||
".xlsx",
|
||||
".pptx",
|
||||
],
|
||||
job_name,
|
||||
final_filename,
|
||||
|
||||
@@ -12,6 +12,7 @@ from application.parser.file.markdown_parser import MarkdownParser
|
||||
from application.parser.file.rst_parser import RstParser
|
||||
from application.parser.file.tabular_parser import PandasCSVParser,ExcelParser
|
||||
from application.parser.file.json_parser import JSONParser
|
||||
from application.parser.file.pptx_parser import PPTXParser
|
||||
from application.parser.schema.base import Document
|
||||
|
||||
DEFAULT_FILE_EXTRACTOR: Dict[str, BaseParser] = {
|
||||
@@ -25,6 +26,7 @@ DEFAULT_FILE_EXTRACTOR: Dict[str, BaseParser] = {
|
||||
".html": HTMLParser(),
|
||||
".mdx": MarkdownParser(),
|
||||
".json":JSONParser(),
|
||||
".pptx":PPTXParser(),
|
||||
}
|
||||
|
||||
|
||||
|
||||
75
application/parser/file/pptx_parser.py
Normal file
75
application/parser/file/pptx_parser.py
Normal file
@@ -0,0 +1,75 @@
|
||||
"""PPT parser.
|
||||
Contains parsers for presentation (.pptx) files to extract slide text.
|
||||
"""
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Union
|
||||
|
||||
from application.parser.file.base_parser import BaseParser
|
||||
|
||||
class PPTXParser(BaseParser):
|
||||
r"""PPTX (.pptx) parser for extracting text from PowerPoint slides.
|
||||
Args:
|
||||
concat_slides (bool): Specifies whether to concatenate all slide text into one document.
|
||||
- If True, slide texts will be joined together as a single string.
|
||||
- If False, each slide's text will be stored as a separate entry in a list.
|
||||
Set to True by default.
|
||||
slide_separator (str): Separator used to join slides' text content.
|
||||
Only used when `concat_slides=True`. Default is "\n".
|
||||
Refer to https://python-pptx.readthedocs.io/en/latest/ for more information.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*args: Any,
|
||||
concat_slides: bool = True,
|
||||
slide_separator: str = "\n",
|
||||
**kwargs: Any
|
||||
) -> None:
|
||||
"""Init params."""
|
||||
super().__init__(*args, **kwargs)
|
||||
self._concat_slides = concat_slides
|
||||
self._slide_separator = slide_separator
|
||||
|
||||
def _init_parser(self) -> Dict:
|
||||
"""Init parser."""
|
||||
return {}
|
||||
|
||||
def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]]:
|
||||
r"""
|
||||
Parse a .pptx file and extract text from each slide.
|
||||
Args:
|
||||
file (Path): Path to the .pptx file.
|
||||
errors (str): Error handling policy ('ignore' by default).
|
||||
Returns:
|
||||
Union[str, List[str]]: Concatenated text if concat_slides is True,
|
||||
otherwise a list of slide texts.
|
||||
"""
|
||||
|
||||
try:
|
||||
from pptx import Presentation
|
||||
except ImportError:
|
||||
raise ImportError("pptx module is required to read .PPTX files.")
|
||||
|
||||
try:
|
||||
presentation = Presentation(file)
|
||||
slide_texts=[]
|
||||
|
||||
# Iterate over each slide in the presentation
|
||||
for slide in presentation.slides:
|
||||
slide_text=""
|
||||
|
||||
# Iterate over each shape in the slide
|
||||
for shape in slide.shapes:
|
||||
# Check if the shape has a 'text' attribute and append that to the slide_text
|
||||
if hasattr(shape,"text"):
|
||||
slide_text+=shape.text
|
||||
|
||||
slide_texts.append(slide_text.strip())
|
||||
|
||||
if self._concat_slides:
|
||||
return self._slide_separator.join(slide_texts)
|
||||
else:
|
||||
return slide_texts
|
||||
|
||||
except Exception as e:
|
||||
raise e
|
||||
@@ -14,6 +14,7 @@ esutils==1.0.1
|
||||
Flask==3.0.3
|
||||
faiss-cpu==1.8.0.post1
|
||||
flask-restx==1.3.0
|
||||
gTTS==2.3.2
|
||||
gunicorn==23.0.0
|
||||
html2text==2024.2.26
|
||||
javalang==0.13.0
|
||||
@@ -65,6 +66,7 @@ pymongo==4.8.0
|
||||
pypdf2==3.0.1
|
||||
python-dateutil==2.9.0.post0
|
||||
python-dotenv==1.0.1
|
||||
python-pptx==0.4.1
|
||||
qdrant-client==1.11.0
|
||||
redis==5.0.1
|
||||
referencing==0.30.2
|
||||
@@ -84,5 +86,4 @@ urllib3==2.2.3
|
||||
vine==5.1.0
|
||||
wcwidth==0.2.13
|
||||
werkzeug==3.0.4
|
||||
yarl==1.11.1
|
||||
gTTS==2.3.2
|
||||
yarl==1.11.1
|
||||
Reference in New Issue
Block a user