From da5d62cc1caaa1fb46fc211d5c1a03a95797fd52 Mon Sep 17 00:00:00 2001 From: Nazih Kalo Date: Fri, 19 May 2023 10:29:18 -0700 Subject: [PATCH] updating the bulk ingest file metadata to account for parsers that output lists --- scripts/parser/file/bulk.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/scripts/parser/file/bulk.py b/scripts/parser/file/bulk.py index 8a963104..8b5bd406 100644 --- a/scripts/parser/file/bulk.py +++ b/scripts/parser/file/bulk.py @@ -1,8 +1,5 @@ """Simple reader that reads files of different formats from a directory.""" import logging -from pathlib import Path -from typing import Callable, Dict, List, Optional, Union - from parser.file.base import BaseReader from parser.file.base_parser import BaseParser from parser.file.docs_parser import DocxParser, PDFParser @@ -12,6 +9,8 @@ from parser.file.markdown_parser import MarkdownParser from parser.file.rst_parser import RstParser from parser.file.tabular_parser import PandasCSVParser from parser.schema.base import Document +from pathlib import Path +from typing import Callable, Dict, List, Optional, Union DEFAULT_FILE_EXTRACTOR: Dict[str, BaseParser] = { ".pdf": PDFParser(), @@ -151,10 +150,15 @@ class SimpleDirectoryReader(BaseReader): data = f.read() if isinstance(data, List): data_list.extend(data) + if self.file_metadata is not None: + for _ in range(len(data)): + metadata_list.append(self.file_metadata(str(input_file))) else: data_list.append(str(data)) - if self.file_metadata is not None: - metadata_list.append(self.file_metadata(str(input_file))) + if self.file_metadata is not None: + metadata_list.append(self.file_metadata(str(input_file))) + + if concatenate: return [Document("\n".join(data_list))]