Added HTML Support. read, clean-up, filter return

2026-01-21 06:20:34 +00:00 · 2023-02-21 23:06:00 +05:30
parent e8baa46eb6
commit 16eb503e36
2 changed files with 47 additions and 38 deletions
--- a/scripts/parser/file/html_parser.py
+++ b/scripts/parser/file/html_parser.py
@@ -16,8 +16,12 @@ class HTMLParser(BaseParser):
        """Init parser."""
        return {}

-    def parse_file(self, file: Path, errors: str = "ignore") -> str:
-        """Parse file."""
+    def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, list[str]]:
+        """Parse file.
+
+            Returns:
+            Union[str, List[str]]: a string or a List of strings.
+        """
        try:
            import unstructured
        except ImportError:
@@ -26,48 +30,53 @@ class HTMLParser(BaseParser):
        from unstructured.staging.base import convert_to_isd
        from unstructured.cleaners.core import clean

+        # Using the unstructured library to convert the html to isd format
+        # isd sample : isd = [
+                            #   {"text": "My Title", "type": "Title"},
+                            #   {"text": "My Narrative", "type": "NarrativeText"}
+                            # ]
        with open(file, "r", encoding="utf-8") as fp:
            elements = partition_html(file=fp)
-            isd = convert_to_isd(elements)
+            isd = convert_to_isd(elements)  

-            # Removing non ascii charactwers from isd_el['text']
-            for isd_el in isd:
-                isd_el['text'] = isd_el['text'].encode("ascii", "ignore").decode()
+        # Removing non-ASCII characters from isd_el['text']
+        for isd_el in isd:
+            isd_el['text'] = isd_el['text'].encode("ascii", "ignore").decode()

-            # Removing all the \n characters from isd_el['text'] using regex and replace with single space
-            # Removing all the extra spaces  from isd_el['text'] using regex and replace with single space
-            for isd_el in isd:
-                isd_el['text'] = re.sub(r'\n', ' ', isd_el['text'], flags=re.MULTILINE|re.DOTALL)
-                isd_el['text'] = re.sub(r"\s{2,}"," ", isd_el['text'], flags=re.MULTILINE|re.DOTALL)
+        # Replacing every \n character in isd_el['text'] with a single space using regex
+        # Collapsing every run of two or more whitespace characters in isd_el['text'] into a single space using regex
+        for isd_el in isd:
+            isd_el['text'] = re.sub(r'\n', ' ', isd_el['text'], flags=re.MULTILINE|re.DOTALL)
+            isd_el['text'] = re.sub(r"\s{2,}"," ", isd_el['text'], flags=re.MULTILINE|re.DOTALL)

-            # more cleaning: extra_whitespaces, dashes, bullets, trailing_punctuation
-            for isd_el in isd:
-                clean(isd_el['text'], extra_whitespace=True, dashes=True, bullets=True, trailing_punctuation=True )
+        # more cleaning: extra_whitespaces, dashes, bullets, trailing_punctuation
+        for isd_el in isd:
+            clean(isd_el['text'], extra_whitespace=True, dashes=True, bullets=True, trailing_punctuation=True )

-            # Creating a list of all the indexes of isd_el['type'] = 'Title'
-            title_indexes = [i for i,isd_el in enumerate(isd) if isd_el['type'] == 'Title']
+        # Creating a list of all the indexes of isd_el['type'] = 'Title'
+        title_indexes = [i for i,isd_el in enumerate(isd) if isd_el['type'] == 'Title']

-            # Creating 'Chunks' - List of lists of strings 
-            # each list starting with with isd_el['type'] = 'Title' and all the data till the next 'Title'
-            # Each Chunk can be thought of as an individual set of data, which can be sent to the model
+        # Creating 'Chunks' - a list of lists of strings,
+        # each list starting with an element whose isd_el['type'] == 'Title', followed by all the data up to the next 'Title'
+        # Each Chunk can be thought of as an individual set of data, which can be sent to the model
+        # Where Each Title is grouped together with the data under it

-            Chunks = list(list())
+        Chunks = list(list())
+        final_chunks = list(list())

-            for i,isd_el in enumerate(isd):
-                if i in title_indexes:
-                    Chunks.append([])
-                Chunks[-1].append(isd_el['text'])
+        for i,isd_el in enumerate(isd):
+            if i in title_indexes:
+                Chunks.append([])
+            Chunks[-1].append(isd_el['text'])

-            print(Chunks)
-
-            # writing the chunks to a file
-            # with open('chunks.txt', 'w') as f:
-                # for chunk in Chunks:
-                    # f.write("%s \n" % chunk)
-
-
-        # # convert to isd ;Format : {'text': 'Navigation', 'type': 'Title'}         
-        # with open(file, "r", encoding="utf-8") as fp:
-        #     elements = partition_html(file=fp)
-        #     isd = convert_to_isd(elements)
-        #     print(isd)
+        # Removing all the chunks whose length, measured as len(str(chunk)), is < 25  # TODO: this value could be a user-defined variable
+        for chunk in Chunks:
+            # length of the chunk's string form, str(chunk) (the list repr, not the summed lengths of its strings)
+            sum = 0
+            sum += len(str(chunk))
+            if sum < 25:
+                Chunks.remove(chunk)
+            else :         
+                # appending all the approved chunks to final_chunks as a single string       
+                final_chunks.append(" ".join([str(item) for item in chunk]))
+        return final_chunks