Proper PEP8 formatting

This commit is contained in:
Anton Larin
2023-05-12 12:02:25 +02:00
parent 7f56f57778
commit 168648e789
14 changed files with 139 additions and 123 deletions

View File

@@ -52,17 +52,17 @@ class SimpleDirectoryReader(BaseReader):
"""
def __init__(
self,
input_dir: Optional[str] = None,
input_files: Optional[List] = None,
exclude_hidden: bool = True,
errors: str = "ignore",
recursive: bool = True,
required_exts: Optional[List[str]] = None,
file_extractor: Optional[Dict[str, BaseParser]] = None,
num_files_limit: Optional[int] = None,
file_metadata: Optional[Callable[[str], Dict]] = None,
chunk_size_max: int = 2048,
self,
input_dir: Optional[str] = None,
input_files: Optional[List] = None,
exclude_hidden: bool = True,
errors: str = "ignore",
recursive: bool = True,
required_exts: Optional[List[str]] = None,
file_extractor: Optional[Dict[str, BaseParser]] = None,
num_files_limit: Optional[int] = None,
file_metadata: Optional[Callable[[str], Dict]] = None,
chunk_size_max: int = 2048,
) -> None:
"""Initialize with parameters."""
super().__init__()
@@ -103,8 +103,8 @@ class SimpleDirectoryReader(BaseReader):
elif self.exclude_hidden and input_file.name.startswith("."):
continue
elif (
self.required_exts is not None
and input_file.suffix not in self.required_exts
self.required_exts is not None
and input_file.suffix not in self.required_exts
):
continue
else:
@@ -115,7 +115,7 @@ class SimpleDirectoryReader(BaseReader):
new_input_files.extend(sub_input_files)
if self.num_files_limit is not None and self.num_files_limit > 0:
new_input_files = new_input_files[0 : self.num_files_limit]
new_input_files = new_input_files[0: self.num_files_limit]
# print total number of files added
logging.debug(

View File

@@ -9,6 +9,7 @@ from typing import Dict, Union
from parser.file.base_parser import BaseParser
class HTMLParser(BaseParser):
"""HTML parser."""
@@ -32,12 +33,12 @@ class HTMLParser(BaseParser):
# Using the unstructured library to convert the html to isd format
# isd sample : isd = [
# {"text": "My Title", "type": "Title"},
# {"text": "My Narrative", "type": "NarrativeText"}
# ]
# {"text": "My Title", "type": "Title"},
# {"text": "My Narrative", "type": "NarrativeText"}
# ]
with open(file, "r", encoding="utf-8") as fp:
elements = partition_html(file=fp)
isd = convert_to_isd(elements)
isd = convert_to_isd(elements)
# Removing non-ASCII characters from isd_el['text']
for isd_el in isd:
@@ -46,15 +47,15 @@ class HTMLParser(BaseParser):
# Removing all the \n characters from isd_el['text'] using regex and replace with single space
# Removing all the extra spaces from isd_el['text'] using regex and replace with single space
for isd_el in isd:
isd_el['text'] = re.sub(r'\n', ' ', isd_el['text'], flags=re.MULTILINE|re.DOTALL)
isd_el['text'] = re.sub(r"\s{2,}"," ", isd_el['text'], flags=re.MULTILINE|re.DOTALL)
isd_el['text'] = re.sub(r'\n', ' ', isd_el['text'], flags=re.MULTILINE | re.DOTALL)
isd_el['text'] = re.sub(r"\s{2,}", " ", isd_el['text'], flags=re.MULTILINE | re.DOTALL)
# more cleaning: extra_whitespaces, dashes, bullets, trailing_punctuation
for isd_el in isd:
clean(isd_el['text'], extra_whitespace=True, dashes=True, bullets=True, trailing_punctuation=True )
clean(isd_el['text'], extra_whitespace=True, dashes=True, bullets=True, trailing_punctuation=True)
# Creating a list of all the indexes of isd_el['type'] = 'Title'
title_indexes = [i for i,isd_el in enumerate(isd) if isd_el['type'] == 'Title']
title_indexes = [i for i, isd_el in enumerate(isd) if isd_el['type'] == 'Title']
# Creating 'Chunks' - List of lists of strings
# each list starting with isd_el['type'] = 'Title' and all the data till the next 'Title'
@@ -64,7 +65,7 @@ class HTMLParser(BaseParser):
Chunks = [[]]
final_chunks = list(list())
for i,isd_el in enumerate(isd):
for i, isd_el in enumerate(isd):
if i in title_indexes:
Chunks.append([])
Chunks[-1].append(isd_el['text'])
@@ -76,7 +77,7 @@ class HTMLParser(BaseParser):
sum += len(str(chunk))
if sum < 25:
Chunks.remove(chunk)
else :
else:
# appending all the approved chunks to final_chunks as a single string
final_chunks.append(" ".join([str(item) for item in chunk]))
return final_chunks

View File

@@ -20,13 +20,13 @@ class MarkdownParser(BaseParser):
"""
def __init__(
self,
*args: Any,
remove_hyperlinks: bool = True,
remove_images: bool = True,
max_tokens: int = 2048,
# remove_tables: bool = True,
**kwargs: Any,
self,
*args: Any,
remove_hyperlinks: bool = True,
remove_images: bool = True,
max_tokens: int = 2048,
# remove_tables: bool = True,
**kwargs: Any,
) -> None:
"""Init params."""
super().__init__(*args, **kwargs)
@@ -35,8 +35,8 @@ class MarkdownParser(BaseParser):
self._max_tokens = max_tokens
# self._remove_tables = remove_tables
def tups_chunk_append(self, tups: List[Tuple[Optional[str], str]], current_header: Optional[str], current_text: str):
def tups_chunk_append(self, tups: List[Tuple[Optional[str], str]], current_header: Optional[str],
current_text: str):
"""Append to tups chunk."""
num_tokens = len(tiktoken.get_encoding("cl100k_base").encode(current_text))
if num_tokens > self._max_tokens:
@@ -46,6 +46,7 @@ class MarkdownParser(BaseParser):
else:
tups.append((current_header, current_text))
return tups
def markdown_to_tups(self, markdown_text: str) -> List[Tuple[Optional[str], str]]:
"""Convert a markdown file to a dictionary.
@@ -115,7 +116,7 @@ class MarkdownParser(BaseParser):
return {}
def parse_tups(
self, filepath: Path, errors: str = "ignore"
self, filepath: Path, errors: str = "ignore"
) -> List[Tuple[Optional[str], str]]:
"""Parse file into tuples."""
with open(filepath, "r") as f:
@@ -130,7 +131,7 @@ class MarkdownParser(BaseParser):
return markdown_tups
def parse_file(
self, filepath: Path, errors: str = "ignore"
self, filepath: Path, errors: str = "ignore"
) -> Union[str, List[str]]:
"""Parse file into string."""
tups = self.parse_tups(filepath, errors=errors)

View File

@@ -10,6 +10,7 @@ from typing import Any, Dict, List, Optional, Tuple, Union, cast
from parser.file.base_parser import BaseParser
import tiktoken
class RstParser(BaseParser):
"""reStructuredText parser.
@@ -19,17 +20,17 @@ class RstParser(BaseParser):
"""
def __init__(
self,
*args: Any,
remove_hyperlinks: bool = True,
remove_images: bool = True,
remove_table_excess: bool = True,
remove_interpreters: bool = True,
remove_directives: bool = True,
remove_whitespaces_excess: bool = True,
#Be careful with remove_characters_excess, might cause data loss
remove_characters_excess: bool = True,
**kwargs: Any,
self,
*args: Any,
remove_hyperlinks: bool = True,
remove_images: bool = True,
remove_table_excess: bool = True,
remove_interpreters: bool = True,
remove_directives: bool = True,
remove_whitespaces_excess: bool = True,
# Be careful with remove_characters_excess, might cause data loss
remove_characters_excess: bool = True,
**kwargs: Any,
) -> None:
"""Init params."""
super().__init__(*args, **kwargs)
@@ -41,7 +42,6 @@ class RstParser(BaseParser):
self._remove_whitespaces_excess = remove_whitespaces_excess
self._remove_characters_excess = remove_characters_excess
def rst_to_tups(self, rst_text: str) -> List[Tuple[Optional[str], str]]:
"""Convert a reStructuredText file to a dictionary.
@@ -56,7 +56,8 @@ class RstParser(BaseParser):
for i, line in enumerate(lines):
header_match = re.match(r"^[^\S\n]*[-=]+[^\S\n]*$", line)
if header_match and i > 0 and (len(lines[i - 1].strip()) == len(header_match.group().strip()) or lines[i - 2] == lines[i - 2]):
if header_match and i > 0 and (
len(lines[i - 1].strip()) == len(header_match.group().strip()) or lines[i - 2] == lines[i - 2]):
if current_header is not None:
if current_text == "" or None:
continue
@@ -72,7 +73,7 @@ class RstParser(BaseParser):
rst_tups.append((current_header, current_text))
#TODO: Format for rst
# TODO: Format for rst
#
# if current_header is not None:
# # pass linting, assert keys are defined
@@ -136,7 +137,7 @@ class RstParser(BaseParser):
return {}
def parse_tups(
self, filepath: Path, errors: str = "ignore"
self, filepath: Path, errors: str = "ignore"
) -> List[Tuple[Optional[str], str]]:
"""Parse file into tuples."""
with open(filepath, "r") as f:
@@ -159,7 +160,7 @@ class RstParser(BaseParser):
return rst_tups
def parse_file(
self, filepath: Path, errors: str = "ignore"
self, filepath: Path, errors: str = "ignore"
) -> Union[str, List[str]]:
"""Parse file into string."""
tups = self.parse_tups(filepath, errors=errors)

View File

@@ -77,13 +77,13 @@ class PandasCSVParser(BaseParser):
"""
def __init__(
self,
*args: Any,
concat_rows: bool = True,
col_joiner: str = ", ",
row_joiner: str = "\n",
pandas_config: dict = {},
**kwargs: Any
self,
*args: Any,
concat_rows: bool = True,
col_joiner: str = ", ",
row_joiner: str = "\n",
pandas_config: dict = {},
**kwargs: Any
) -> None:
"""Init params."""
super().__init__(*args, **kwargs)