move folder

2026-02-08 23:30:42 +00:00 · 2023-02-10 16:10:53 +00:00
parent 8c4fcff617
commit d642782a5a
12 changed files with 0 additions and 0 deletions
--- a/scripts/parser/file/tabular_parser.py
+++ b/scripts/parser/file/tabular_parser.py
@@ -0,0 +1,115 @@
+"""Tabular parser.
+
+Contains parsers for tabular data files.
+
+"""
+from pathlib import Path
+from typing import Any, Dict, List, Union
+
+from parser.file.base_parser import BaseParser
+
+
+class CSVParser(BaseParser):
+    """CSV parser.
+
+    Reads a CSV file with the standard-library `csv` module and renders every
+    row as one line of text.
+
+    Args:
+        concat_rows (bool): whether to concatenate all rows into one string.
+            If set to False, `parse_file` returns one string per row.
+            True by default.
+
+    """
+
+    def __init__(self, *args: Any, concat_rows: bool = True, **kwargs: Any) -> None:
+        """Init params.
+
+        Positional and remaining keyword arguments are forwarded unchanged to
+        the `BaseParser` constructor.
+
+        """
+        super().__init__(*args, **kwargs)
+        self._concat_rows = concat_rows
+
+    def _init_parser(self) -> Dict:
+        """Init parser.
+
+        Returns:
+            Dict: always an empty dict.
+
+        """
+        return {}
+
+    def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]]:
+        """Parse file.
+
+        Args:
+            file (Path): path of the CSV file to read.
+            errors (str): decoding error handler handed to `open`;
+                "ignore" by default.
+
+        Returns:
+            Union[str, List[str]]: each row's cells joined with ", ". The rows
+                are joined with newlines into a single string when
+                `concat_rows` is True, otherwise returned as a list.
+
+        """
+        import csv
+
+        # newline="" leaves line-ending handling to the csv module, which is
+        # what it needs to parse quoted fields that contain newlines.
+        with open(file, "r", newline="", errors=errors) as fp:
+            text_list = [", ".join(row) for row in csv.reader(fp)]
+
+        if self._concat_rows:
+            return "\n".join(text_list)
+        return text_list
+
+
+class PandasCSVParser(BaseParser):
+    r"""Pandas-based CSV parser.
+
+    Parses CSVs with the pandas `read_csv` function.
+    If special parameters are required, use the `pandas_config` dict.
+
+    Args:
+        concat_rows (bool): whether to concatenate all rows into one string.
+            If set to False, `parse_file` returns one string per row.
+            True by default.
+
+        col_joiner (str): Separator to use for joining cols per row.
+            Set to ", " by default.
+
+        row_joiner (str): Separator to use for joining each row.
+            Only used when `concat_rows=True`.
+            Set to "\n" by default.
+
+        pandas_config (dict): Options for the `pandas.read_csv` function call.
+            Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html
+            for more information.
+            Defaults to None, which is treated as an empty dict, so `read_csv`
+            runs with its own defaults (comma separator, first row as header).
+            Pass e.g. {"sep": None, "engine": "python"} to let pandas sniff
+            the separator.
+
+    """
+
+    def __init__(
+        self,
+        *args: Any,
+        concat_rows: bool = True,
+        col_joiner: str = ", ",
+        row_joiner: str = "\n",
+        pandas_config: Union[Dict[str, Any], None] = None,
+        **kwargs: Any
+    ) -> None:
+        """Init params.
+
+        Positional and remaining keyword arguments are forwarded unchanged to
+        the `BaseParser` constructor.
+
+        """
+        super().__init__(*args, **kwargs)
+        self._concat_rows = concat_rows
+        self._col_joiner = col_joiner
+        self._row_joiner = row_joiner
+        # Keep a private copy: a mutable default would be shared by every
+        # instance, and later changes to the caller's dict must not leak in.
+        self._pandas_config = dict(pandas_config or {})
+
+    def _init_parser(self) -> Dict:
+        """Init parser.
+
+        Returns:
+            Dict: always an empty dict.
+
+        """
+        return {}
+
+    def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]]:
+        """Parse file.
+
+        Args:
+            file (Path): path of the CSV file, handed to `pandas.read_csv`
+                together with the stored `pandas_config` options.
+            errors (str): accepted but not used by this parser.
+
+        Returns:
+            Union[str, List[str]]: each row's values cast to str and joined
+                with `col_joiner`. The rows are joined with `row_joiner` into
+                a single string when `concat_rows` is True, otherwise
+                returned as a list.
+
+        Raises:
+            ValueError: if pandas is not installed.
+
+        """
+        try:
+            import pandas as pd
+        except ImportError as err:
+            # Same exception type as before; chaining keeps the root cause.
+            raise ValueError("pandas module is required to read CSV files.") from err
+
+        df = pd.read_csv(file, **self._pandas_config)
+
+        text_list = df.apply(
+            lambda row: self._col_joiner.join(row.astype(str).tolist()), axis=1
+        ).tolist()
+
+        if self._concat_rows:
+            return self._row_joiner.join(text_list)
+        return text_list