* validate python formatting on every build with Ruff
* fix lint warnings
This commit is contained in:
Anton Larin
2023-05-13 10:36:17 +02:00
committed by GitHub
parent 168648e789
commit 962becb9a5
35 changed files with 277 additions and 252 deletions

View File

@@ -3,7 +3,6 @@ from abc import abstractmethod
from typing import Any, List
from langchain.docstore.document import Document as LCDocument
from parser.schema.base import Document

View File

@@ -24,12 +24,11 @@ class HTMLParser(BaseParser):
Union[str, List[str]]: a string or a List of strings.
"""
try:
import unstructured
from unstructured.partition.html import partition_html
from unstructured.staging.base import convert_to_isd
from unstructured.cleaners.core import clean
except ImportError:
raise ValueError("unstructured package is required to parse HTML files.")
from unstructured.partition.html import partition_html
from unstructured.staging.base import convert_to_isd
from unstructured.cleaners.core import clean
# Using the unstructured library to convert the html to isd format
# isd sample : isd = [
@@ -70,7 +69,8 @@ class HTMLParser(BaseParser):
Chunks.append([])
Chunks[-1].append(isd_el['text'])
# Removing all the chunks with sum of lenth of all the strings in the chunk < 25 #TODO: This value can be an user defined variable
# Removing all the chunks with sum of lenth of all the strings in the chunk < 25
# TODO: This value can be a user defined variable
for chunk in Chunks:
# sum of lenth of all the strings in the chunk
sum = 0

View File

@@ -7,8 +7,8 @@ import re
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union, cast
from parser.file.base_parser import BaseParser
import tiktoken
from parser.file.base_parser import BaseParser
class MarkdownParser(BaseParser):

View File

@@ -5,10 +5,9 @@ Contains parser for md files.
"""
import re
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union, cast
from typing import Any, Dict, List, Optional, Tuple, Union
from parser.file.base_parser import BaseParser
import tiktoken
class RstParser(BaseParser):