Proper PEP8 formatting

This commit is contained in:
Anton Larin
2023-05-12 12:02:25 +02:00
parent 7f56f57778
commit 168648e789
14 changed files with 139 additions and 123 deletions

View File

@@ -9,6 +9,7 @@ from typing import Dict, Union
from parser.file.base_parser import BaseParser
class HTMLParser(BaseParser):
"""HTML parser."""
@@ -32,12 +33,12 @@ class HTMLParser(BaseParser):
# Using the unstructured library to convert the html to isd format
# isd sample : isd = [
# {"text": "My Title", "type": "Title"},
# {"text": "My Narrative", "type": "NarrativeText"}
# ]
# {"text": "My Title", "type": "Title"},
# {"text": "My Narrative", "type": "NarrativeText"}
# ]
with open(file, "r", encoding="utf-8") as fp:
elements = partition_html(file=fp)
isd = convert_to_isd(elements)
isd = convert_to_isd(elements)
# Removing non ascii charactwers from isd_el['text']
for isd_el in isd:
@@ -46,15 +47,15 @@ class HTMLParser(BaseParser):
# Removing all the \n characters from isd_el['text'] using regex and replace with single space
# Removing all the extra spaces from isd_el['text'] using regex and replace with single space
for isd_el in isd:
isd_el['text'] = re.sub(r'\n', ' ', isd_el['text'], flags=re.MULTILINE|re.DOTALL)
isd_el['text'] = re.sub(r"\s{2,}"," ", isd_el['text'], flags=re.MULTILINE|re.DOTALL)
isd_el['text'] = re.sub(r'\n', ' ', isd_el['text'], flags=re.MULTILINE | re.DOTALL)
isd_el['text'] = re.sub(r"\s{2,}", " ", isd_el['text'], flags=re.MULTILINE | re.DOTALL)
# more cleaning: extra_whitespaces, dashes, bullets, trailing_punctuation
for isd_el in isd:
clean(isd_el['text'], extra_whitespace=True, dashes=True, bullets=True, trailing_punctuation=True )
clean(isd_el['text'], extra_whitespace=True, dashes=True, bullets=True, trailing_punctuation=True)
# Creating a list of all the indexes of isd_el['type'] = 'Title'
title_indexes = [i for i,isd_el in enumerate(isd) if isd_el['type'] == 'Title']
title_indexes = [i for i, isd_el in enumerate(isd) if isd_el['type'] == 'Title']
# Creating 'Chunks' - List of lists of strings
# each list starting with with isd_el['type'] = 'Title' and all the data till the next 'Title'
@@ -64,7 +65,7 @@ class HTMLParser(BaseParser):
Chunks = [[]]
final_chunks = list(list())
for i,isd_el in enumerate(isd):
for i, isd_el in enumerate(isd):
if i in title_indexes:
Chunks.append([])
Chunks[-1].append(isd_el['text'])
@@ -76,7 +77,7 @@ class HTMLParser(BaseParser):
sum += len(str(chunk))
if sum < 25:
Chunks.remove(chunk)
else :
else:
# appending all the approved chunks to final_chunks as a single string
final_chunks.append(" ".join([str(item) for item in chunk]))
return final_chunks