mirror of
https://github.com/arc53/DocsGPT.git
synced 2025-11-29 16:43:16 +00:00
Linting
* validate python formatting on every build with Ruff * fix lint warnings
This commit is contained in:
@@ -3,7 +3,6 @@ from abc import abstractmethod
|
||||
from typing import Any, List
|
||||
|
||||
from langchain.docstore.document import Document as LCDocument
|
||||
|
||||
from parser.schema.base import Document
|
||||
|
||||
|
||||
|
||||
@@ -24,12 +24,11 @@ class HTMLParser(BaseParser):
|
||||
Union[str, List[str]]: a string or a List of strings.
|
||||
"""
|
||||
try:
|
||||
import unstructured
|
||||
from unstructured.partition.html import partition_html
|
||||
from unstructured.staging.base import convert_to_isd
|
||||
from unstructured.cleaners.core import clean
|
||||
except ImportError:
|
||||
raise ValueError("unstructured package is required to parse HTML files.")
|
||||
from unstructured.partition.html import partition_html
|
||||
from unstructured.staging.base import convert_to_isd
|
||||
from unstructured.cleaners.core import clean
|
||||
|
||||
# Using the unstructured library to convert the html to isd format
|
||||
# isd sample : isd = [
|
||||
@@ -70,7 +69,8 @@ class HTMLParser(BaseParser):
|
||||
Chunks.append([])
|
||||
Chunks[-1].append(isd_el['text'])
|
||||
|
||||
# Removing all the chunks with sum of lenth of all the strings in the chunk < 25 #TODO: This value can be an user defined variable
|
||||
# Removing all the chunks with sum of lenth of all the strings in the chunk < 25
|
||||
# TODO: This value can be a user defined variable
|
||||
for chunk in Chunks:
|
||||
# sum of lenth of all the strings in the chunk
|
||||
sum = 0
|
||||
|
||||
@@ -7,8 +7,8 @@ import re
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Tuple, Union, cast
|
||||
|
||||
from parser.file.base_parser import BaseParser
|
||||
import tiktoken
|
||||
from parser.file.base_parser import BaseParser
|
||||
|
||||
|
||||
class MarkdownParser(BaseParser):
|
||||
|
||||
@@ -5,10 +5,9 @@ Contains parser for md files.
|
||||
"""
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Tuple, Union, cast
|
||||
from typing import Any, Dict, List, Optional, Tuple, Union
|
||||
|
||||
from parser.file.base_parser import BaseParser
|
||||
import tiktoken
|
||||
|
||||
|
||||
class RstParser(BaseParser):
|
||||
|
||||
Reference in New Issue
Block a user