Implemented html_parser: cleaning & chunk creation

2026-02-17 11:42:22 +00:00 · 2023-02-19 01:53:16 +05:30
parent 12e8ee3088
commit d0b472ad38
2 changed files with 75 additions and 0 deletions
--- a/scripts/parser/file/bulk.py
+++ b/scripts/parser/file/bulk.py
@@ -7,6 +7,7 @@ from parser.file.base import BaseReader
 from parser.file.base_parser import BaseParser
 from parser.file.docs_parser import DocxParser, PDFParser
 from parser.file.epub_parser import EpubParser
+from parser.file.html_parser import HTMLParser
 from parser.file.markdown_parser import MarkdownParser
 from parser.file.rst_parser import RstParser
 from parser.file.tabular_parser import PandasCSVParser
@@ -19,6 +20,7 @@ DEFAULT_FILE_EXTRACTOR: Dict[str, BaseParser] = {
    ".epub": EpubParser(),
    ".md": MarkdownParser(),
    ".rst": RstParser(),
+    ".html": HTMLParser(),
 }


--- a/scripts/parser/file/html_parser.py
+++ b/scripts/parser/file/html_parser.py
@@ -0,0 +1,73 @@
+"""HTML parser.
+
+Contains parser for html files.
+
+"""
+import re
+from pathlib import Path
+from typing import Dict, Union
+
+from parser.file.base_parser import BaseParser
+
+class HTMLParser(BaseParser):
+    """Parser that turns an HTML file into title-delimited text chunks."""
+
+    def _init_parser(self) -> Dict:
+        """Return the parser config; this parser needs none, so it is empty."""
+        return {}
+
+    def parse_file(self, file: Path, errors: str = "ignore") -> str:
+        """Parse an HTML file into cleaned text grouped by title.
+
+        The file is partitioned with ``unstructured``; each element's text is
+        reduced to ASCII, whitespace-normalized and cleaned. A new chunk
+        starts at every element of type ``Title``.
+
+        Args:
+            file: Path of the HTML file to read (decoded as UTF-8).
+            errors: Decoding error handler passed to ``open``.
+
+        Returns:
+            The chunks as one string: elements of a chunk are joined by a
+            space and chunks are separated by a blank line.
+
+        Raises:
+            ValueError: If the ``unstructured`` package is not installed.
+        """
+        try:
+            import unstructured  # noqa: F401  (availability check only)
+        except ImportError as err:
+            raise ValueError(
+                "unstructured package is required to parse HTML files."
+            ) from err
+        from unstructured.cleaners.core import clean
+        from unstructured.partition.html import partition_html
+        from unstructured.staging.base import convert_to_isd
+
+        # Only partitioning needs the open handle; release it before cleaning.
+        with open(file, "r", encoding="utf-8", errors=errors) as fp:
+            # ISD format: list of dicts such as {'text': 'Navigation', 'type': 'Title'}
+            isd = convert_to_isd(partition_html(file=fp))
+
+        chunks = []
+        for isd_el in isd:
+            # Drop non-ASCII characters.
+            text = isd_el["text"].encode("ascii", "ignore").decode()
+            # Turn newlines into spaces, then collapse whitespace runs.
+            text = re.sub(r"\n", " ", text)
+            text = re.sub(r"\s{2,}", " ", text)
+            # clean() returns a new string, so its result must be kept.
+            text = clean(
+                text,
+                extra_whitespace=True,
+                dashes=True,
+                bullets=True,
+                trailing_punctuation=True,
+            )
+            # A Title opens a new chunk; so does content that precedes the
+            # first Title, which would otherwise have no chunk to join.
+            if isd_el["type"] == "Title" or not chunks:
+                chunks.append([])
+            chunks[-1].append(text)
+
+        return "\n\n".join(" ".join(chunk) for chunk in chunks)