mirror of
https://github.com/arc53/DocsGPT.git
synced 2025-11-30 09:03:15 +00:00
Proper PEP8 formatting
This commit is contained in:
@@ -9,6 +9,7 @@ from typing import Dict, Union
|
||||
|
||||
from parser.file.base_parser import BaseParser
|
||||
|
||||
|
||||
class HTMLParser(BaseParser):
|
||||
"""HTML parser."""
|
||||
|
||||
@@ -32,12 +33,12 @@ class HTMLParser(BaseParser):
|
||||
|
||||
# Using the unstructured library to convert the html to isd format
|
||||
# isd sample : isd = [
|
||||
# {"text": "My Title", "type": "Title"},
|
||||
# {"text": "My Narrative", "type": "NarrativeText"}
|
||||
# ]
|
||||
# {"text": "My Title", "type": "Title"},
|
||||
# {"text": "My Narrative", "type": "NarrativeText"}
|
||||
# ]
|
||||
with open(file, "r", encoding="utf-8") as fp:
|
||||
elements = partition_html(file=fp)
|
||||
isd = convert_to_isd(elements)
|
||||
isd = convert_to_isd(elements)
|
||||
|
||||
# Removing non ascii charactwers from isd_el['text']
|
||||
for isd_el in isd:
|
||||
@@ -46,15 +47,15 @@ class HTMLParser(BaseParser):
|
||||
# Removing all the \n characters from isd_el['text'] using regex and replace with single space
|
||||
# Removing all the extra spaces from isd_el['text'] using regex and replace with single space
|
||||
for isd_el in isd:
|
||||
isd_el['text'] = re.sub(r'\n', ' ', isd_el['text'], flags=re.MULTILINE|re.DOTALL)
|
||||
isd_el['text'] = re.sub(r"\s{2,}"," ", isd_el['text'], flags=re.MULTILINE|re.DOTALL)
|
||||
isd_el['text'] = re.sub(r'\n', ' ', isd_el['text'], flags=re.MULTILINE | re.DOTALL)
|
||||
isd_el['text'] = re.sub(r"\s{2,}", " ", isd_el['text'], flags=re.MULTILINE | re.DOTALL)
|
||||
|
||||
# more cleaning: extra_whitespaces, dashes, bullets, trailing_punctuation
|
||||
for isd_el in isd:
|
||||
clean(isd_el['text'], extra_whitespace=True, dashes=True, bullets=True, trailing_punctuation=True )
|
||||
clean(isd_el['text'], extra_whitespace=True, dashes=True, bullets=True, trailing_punctuation=True)
|
||||
|
||||
# Creating a list of all the indexes of isd_el['type'] = 'Title'
|
||||
title_indexes = [i for i,isd_el in enumerate(isd) if isd_el['type'] == 'Title']
|
||||
title_indexes = [i for i, isd_el in enumerate(isd) if isd_el['type'] == 'Title']
|
||||
|
||||
# Creating 'Chunks' - List of lists of strings
|
||||
# each list starting with with isd_el['type'] = 'Title' and all the data till the next 'Title'
|
||||
@@ -64,7 +65,7 @@ class HTMLParser(BaseParser):
|
||||
Chunks = [[]]
|
||||
final_chunks = list(list())
|
||||
|
||||
for i,isd_el in enumerate(isd):
|
||||
for i, isd_el in enumerate(isd):
|
||||
if i in title_indexes:
|
||||
Chunks.append([])
|
||||
Chunks[-1].append(isd_el['text'])
|
||||
@@ -76,7 +77,7 @@ class HTMLParser(BaseParser):
|
||||
sum += len(str(chunk))
|
||||
if sum < 25:
|
||||
Chunks.remove(chunk)
|
||||
else :
|
||||
else:
|
||||
# appending all the approved chunks to final_chunks as a single string
|
||||
final_chunks.append(" ".join([str(item) for item in chunk]))
|
||||
return final_chunks
|
||||
|
||||
Reference in New Issue
Block a user