Bulk ingest

Added a method based on indexGPT folder ingester. Additional rst reader included.
This commit is contained in:
Pavel
2023-02-10 19:44:42 +04:00
parent 5038de06bb
commit 79b5ef9c14
13 changed files with 962 additions and 0 deletions

View File

@@ -0,0 +1,43 @@
"""Epub parser.
Contains parsers for epub files.
"""
from pathlib import Path
from typing import Dict
from parser.file.base_parser import BaseParser
class EpubParser(BaseParser):
"""Epub Parser."""
def _init_parser(self) -> Dict:
"""Init parser."""
return {}
def parse_file(self, file: Path, errors: str = "ignore") -> str:
"""Parse file."""
try:
import ebooklib
from ebooklib import epub
except ImportError:
raise ValueError("`EbookLib` is required to read Epub files.")
try:
import html2text
except ImportError:
raise ValueError("`html2text` is required to parse Epub files.")
text_list = []
book = epub.read_epub(file, options={"ignore_ncx": True})
# Iterate through all chapters.
for item in book.get_items():
# Chapters are typically located in epub documents items.
if item.get_type() == ebooklib.ITEM_DOCUMENT:
text_list.append(
html2text.html2text(item.get_content().decode("utf-8"))
)
text = "\n".join(text_list)
return text