Merge pull request #243 from nazihkalo/main

updating the bulk ingest file metadata logic
This commit is contained in:
Alex
2023-05-20 16:02:32 +01:00
committed by GitHub

View File

@@ -1,8 +1,5 @@
"""Simple reader that reads files of different formats from a directory."""
import logging
from pathlib import Path
from typing import Callable, Dict, List, Optional, Union
from parser.file.base import BaseReader
from parser.file.base_parser import BaseParser
from parser.file.docs_parser import DocxParser, PDFParser
@@ -12,6 +9,8 @@ from parser.file.markdown_parser import MarkdownParser
from parser.file.rst_parser import RstParser
from parser.file.tabular_parser import PandasCSVParser
from parser.schema.base import Document
from pathlib import Path
from typing import Callable, Dict, List, Optional, Union
DEFAULT_FILE_EXTRACTOR: Dict[str, BaseParser] = {
".pdf": PDFParser(),
@@ -151,10 +150,15 @@ class SimpleDirectoryReader(BaseReader):
data = f.read()
if isinstance(data, List):
data_list.extend(data)
if self.file_metadata is not None:
for _ in range(len(data)):
metadata_list.append(self.file_metadata(str(input_file)))
else:
data_list.append(str(data))
if self.file_metadata is not None:
metadata_list.append(self.file_metadata(str(input_file)))
if self.file_metadata is not None:
metadata_list.append(self.file_metadata(str(input_file)))
if concatenate:
return [Document("\n".join(data_list))]