Compare commits

...

3 Commits

Author SHA1 Message Date
Alex
fcdb4fb5e8 feat: faster ebook parsing 2026-04-09 18:31:06 +01:00
Alex
e787c896eb upd Security.md 2026-04-08 12:49:20 +01:00
Alex
23aeaff5db Merge pull request #2362 from arc53/v1-mini-improvements
feat: history overwrite
2026-04-06 15:02:32 +01:00
4 changed files with 38 additions and 136 deletions

View File

@@ -2,9 +2,7 @@
## Supported Versions
Supported Versions:
Currently, we support security patches by committing changes and bumping the version published on GitHub.
Security patches target the latest release and the `main` branch. We recommend always running the most recent version.
## Reporting a Vulnerability
@@ -14,7 +12,11 @@ https://github.com/arc53/DocsGPT/security
Then click **Report a vulnerability**.
Alternatively:
Alternatively, email us at: security@arc53.com
security@arc53.com
We aim to acknowledge reports within 48 hours.
## Incident Handling
Arc53 maintains internal incident response procedures. If you believe an active exploit is occurring, include **URGENT** in your report subject line.

View File

@@ -19,25 +19,10 @@ class EpubParser(BaseParser):
def parse_file(self, file: Path, errors: str = "ignore") -> str:
"""Parse file."""
try:
import ebooklib
from ebooklib import epub
from fast_ebook import epub
except ImportError:
raise ValueError("`EbookLib` is required to read Epub files.")
try:
import html2text
except ImportError:
raise ValueError("`html2text` is required to parse Epub files.")
raise ValueError("`fast-ebook` is required to read Epub files.")
text_list = []
book = epub.read_epub(file, options={"ignore_ncx": True})
# Iterate through all chapters.
for item in book.get_items():
# Chapters are typically located in epub documents items.
if item.get_type() == ebooklib.ITEM_DOCUMENT:
text_list.append(
html2text.html2text(item.get_content().decode("utf-8"))
)
text = "\n".join(text_list)
book = epub.read_epub(file)
text = book.to_markdown()
return text

View File

@@ -11,7 +11,7 @@ rapidocr>=1.4.0
onnxruntime>=1.19.0
docx2txt==0.9
ddgs>=8.0.0
ebooklib==0.20
fast-ebook
elevenlabs==2.41.0
Flask==3.1.3
faiss-cpu==1.13.2
@@ -23,7 +23,6 @@ google-auth-httplib2==0.3.1
google-auth-oauthlib==1.3.1
gTTS==2.5.4
gunicorn==25.3.0
html2text==2025.4.15
jinja2==3.1.6
jiter==0.13.0
jmespath==1.1.0

View File

@@ -20,133 +20,49 @@ def test_epub_init_parser():
assert parser.parser_config_set
def test_epub_parser_ebooklib_import_error(epub_parser):
"""Test that ImportError is raised when ebooklib is not available."""
with patch.dict(sys.modules, {"ebooklib": None}):
with pytest.raises(ValueError, match="`EbookLib` is required to read Epub files"):
def test_epub_parser_fast_ebook_import_error(epub_parser):
"""Test that a ValueError is raised when fast-ebook cannot be imported."""
with patch.dict(sys.modules, {"fast_ebook": None}):
with pytest.raises(ValueError, match="`fast-ebook` is required to read Epub files"):
epub_parser.parse_file(Path("test.epub"))
def test_epub_parser_html2text_import_error(epub_parser):
"""Test that ImportError is raised when html2text is not available."""
fake_ebooklib = types.ModuleType("ebooklib")
fake_epub = types.ModuleType("ebooklib.epub")
fake_ebooklib.epub = fake_epub
with patch.dict(sys.modules, {"ebooklib": fake_ebooklib, "ebooklib.epub": fake_epub}):
with patch.dict(sys.modules, {"html2text": None}):
with pytest.raises(ValueError, match="`html2text` is required to parse Epub files"):
epub_parser.parse_file(Path("test.epub"))
def test_epub_parser_successful_parsing(epub_parser):
"""Test successful parsing of an epub file."""
fake_fast_ebook = types.ModuleType("fast_ebook")
fake_epub = types.ModuleType("fast_ebook.epub")
fake_fast_ebook.epub = fake_epub
fake_ebooklib = types.ModuleType("ebooklib")
fake_epub = types.ModuleType("ebooklib.epub")
fake_html2text = types.ModuleType("html2text")
# Mock ebooklib constants
fake_ebooklib.ITEM_DOCUMENT = "document"
fake_ebooklib.epub = fake_epub
mock_item1 = MagicMock()
mock_item1.get_type.return_value = "document"
mock_item1.get_content.return_value = b"<h1>Chapter 1</h1><p>Content 1</p>"
mock_item2 = MagicMock()
mock_item2.get_type.return_value = "document"
mock_item2.get_content.return_value = b"<h1>Chapter 2</h1><p>Content 2</p>"
mock_item3 = MagicMock()
mock_item3.get_type.return_value = "other" # Should be ignored
mock_item3.get_content.return_value = b"<p>Other content</p>"
mock_book = MagicMock()
mock_book.get_items.return_value = [mock_item1, mock_item2, mock_item3]
mock_book.to_markdown.return_value = "# Chapter 1\n\nContent 1\n\n# Chapter 2\n\nContent 2\n"
fake_epub.read_epub = MagicMock(return_value=mock_book)
def mock_html2text_func(html_content):
if "Chapter 1" in html_content:
return "# Chapter 1\n\nContent 1\n"
elif "Chapter 2" in html_content:
return "# Chapter 2\n\nContent 2\n"
return "Other content\n"
fake_html2text.html2text = mock_html2text_func
with patch.dict(sys.modules, {
"ebooklib": fake_ebooklib,
"ebooklib.epub": fake_epub,
"html2text": fake_html2text
"fast_ebook": fake_fast_ebook,
"fast_ebook.epub": fake_epub,
}):
result = epub_parser.parse_file(Path("test.epub"))
expected_result = "# Chapter 1\n\nContent 1\n\n# Chapter 2\n\nContent 2\n"
assert result == expected_result
# Verify epub.read_epub was called with correct parameters
fake_epub.read_epub.assert_called_once_with(Path("test.epub"), options={"ignore_ncx": True})
assert result == "# Chapter 1\n\nContent 1\n\n# Chapter 2\n\nContent 2\n"
fake_epub.read_epub.assert_called_once_with(Path("test.epub"))
def test_epub_parser_empty_book(epub_parser):
"""Test parsing an epub file with no document items."""
# Create mock modules
fake_ebooklib = types.ModuleType("ebooklib")
fake_epub = types.ModuleType("ebooklib.epub")
fake_html2text = types.ModuleType("html2text")
fake_ebooklib.ITEM_DOCUMENT = "document"
fake_ebooklib.epub = fake_epub
# Create mock book with no document items
"""Test parsing an epub file with no content."""
fake_fast_ebook = types.ModuleType("fast_ebook")
fake_epub = types.ModuleType("fast_ebook.epub")
fake_fast_ebook.epub = fake_epub
mock_book = MagicMock()
mock_book.get_items.return_value = []
mock_book.to_markdown.return_value = ""
fake_epub.read_epub = MagicMock(return_value=mock_book)
fake_html2text.html2text = MagicMock()
with patch.dict(sys.modules, {
"ebooklib": fake_ebooklib,
"ebooklib.epub": fake_epub,
"html2text": fake_html2text
"fast_ebook": fake_fast_ebook,
"fast_ebook.epub": fake_epub,
}):
result = epub_parser.parse_file(Path("empty.epub"))
assert result == ""
fake_html2text.html2text.assert_not_called()
def test_epub_parser_non_document_items_ignored(epub_parser):
"""Test that non-document items are ignored during parsing."""
fake_ebooklib = types.ModuleType("ebooklib")
fake_epub = types.ModuleType("ebooklib.epub")
fake_html2text = types.ModuleType("html2text")
fake_ebooklib.ITEM_DOCUMENT = "document"
fake_ebooklib.epub = fake_epub
mock_doc_item = MagicMock()
mock_doc_item.get_type.return_value = "document"
mock_doc_item.get_content.return_value = b"<p>Document content</p>"
mock_other_item = MagicMock()
mock_other_item.get_type.return_value = "image" # Not a document
mock_book = MagicMock()
mock_book.get_items.return_value = [mock_other_item, mock_doc_item]
fake_epub.read_epub = MagicMock(return_value=mock_book)
fake_html2text.html2text = MagicMock(return_value="Document content\n")
with patch.dict(sys.modules, {
"ebooklib": fake_ebooklib,
"ebooklib.epub": fake_epub,
"html2text": fake_html2text
}):
result = epub_parser.parse_file(Path("test.epub"))
assert result == "Document content\n"
fake_html2text.html2text.assert_called_once_with("<p>Document content</p>")