From a4c0861cf4b79d08338147664531a0dae1f38ce7 Mon Sep 17 00:00:00 2001 From: "devendra.parihar" Date: Fri, 18 Oct 2024 12:07:44 +0530 Subject: [PATCH 1/4] fix:GitHubLoader to Handle Binary Files --- application/parser/remote/drive_loader.py | 65 ++++++++++++++++++++++ application/parser/remote/github_loader.py | 19 +++++-- 2 files changed, 78 insertions(+), 6 deletions(-) create mode 100644 application/parser/remote/drive_loader.py diff --git a/application/parser/remote/drive_loader.py b/application/parser/remote/drive_loader.py new file mode 100644 index 00000000..5931e5aa --- /dev/null +++ b/application/parser/remote/drive_loader.py @@ -0,0 +1,65 @@ +import os +from typing import List +from google.oauth2.credentials import Credentials +from googleapiclient.discovery import build +from googleapiclient.http import MediaIoBaseDownload +from google.auth.transport.requests import Request +from io import BytesIO +from langchain_core.documents import Document +from application.parser.remote.base import BaseRemote + +SCOPES = ['https://www.googleapis.com/auth/drive.readonly'] + +class GoogleDriveLoader(BaseRemote): + def __init__(self, token_path: str, credentials_path: str): + # Load OAuth2 credentials from token and credentials JSON files + self.creds = None + if os.path.exists(token_path): + self.creds = Credentials.from_authorized_user_file(token_path, SCOPES) + if not self.creds or not self.creds.valid: + if self.creds and self.creds.expired and self.creds.refresh_token: + self.creds.refresh(Request()) + else: + raise Exception("Invalid or missing credentials. Please authenticate.") + + # Initialize the Google Drive API client + self.service = build('drive', 'v3', credentials=self.creds) + + def fetch_file_content(self, file_id: str) -> str: + request = self.service.files().get_media(fileId=file_id) + file_io = BytesIO() + downloader = MediaIoBaseDownload(file_io, request) + + done = False + while not done: + status, done = downloader.next_chunk() + + file_io.seek(0) + return file_io.read().decode("utf-8") + + def fetch_drive_files(self, folder_id: str = 'root', mime_type_filter: List[str] = None) -> List[dict]: + query = f"'{folder_id}' in parents" + if mime_type_filter: + mime_types_query = " or ".join([f"mimeType='{mime_type}'" for mime_type in mime_type_filter]) + query += f" and ({mime_types_query})" + + results = self.service.files().list(q=query, pageSize=1000, fields="files(id, name, mimeType)").execute() + return results.get('files', []) + + def load_data(self, folder_id: str = 'root', mime_type_filter: List[str] = None) -> List[Document]: + # Fetch the list of files within the specified folder + files = self.fetch_drive_files(folder_id, mime_type_filter) + documents = [] + + # Loop over each file, download its content, and convert it into a document + for file in files: + if file['mimeType'] != 'application/vnd.google-apps.folder': + try: + content = self.fetch_file_content(file['id']) + documents.append(Document(page_content=content, metadata={ + "title": file['name'], + "source": f"https://drive.google.com/file/d/{file['id']}/view" + })) + except Exception as e: + print(f"Failed to load file {file['name']}: {e}") + return documents diff --git a/application/parser/remote/github_loader.py b/application/parser/remote/github_loader.py index 49f0ae9c..95c04102 100644 --- a/application/parser/remote/github_loader.py +++ b/application/parser/remote/github_loader.py @@ -3,6 +3,7 @@ import requests from typing import List from application.parser.remote.base import BaseRemote from langchain_core.documents import Document +import mimetypes class GitHubLoader(BaseRemote): def __init__(self): @@ -18,13 +19,19 @@ class GitHubLoader(BaseRemote): if response.status_code == 200: content = response.json() + mime_type, _ = mimetypes.guess_type(file_path) # Guess the MIME type based on the file extension + if content.get("encoding") == "base64": - try: - decoded_content = base64.b64decode(content["content"]).decode("utf-8") - return f"Filename: {file_path}\n\n{decoded_content}" - except Exception as e: - print(f"Error decoding content for {file_path}: {e}") - raise + if mime_type and mime_type.startswith("text"): # Handle only text files + try: + decoded_content = base64.b64decode(content["content"]).decode("utf-8") + return f"Filename: {file_path}\n\n{decoded_content}" + except Exception as e: + # print(f"Error decoding content for {file_path}: {e}") + raise + else: + # print(f"Skipping binary file: {file_path} (MIME type: {mime_type})") + return f"Filename: {file_path} is a binary file and was skipped." else: return f"Filename: {file_path}\n\n{content['content']}" else: From 09a27053117a0f55450ab7734f573378b8e360a0 Mon Sep 17 00:00:00 2001 From: "devendra.parihar" Date: Fri, 18 Oct 2024 12:08:08 +0530 Subject: [PATCH 2/4] fix:GitHubLoader to Handle Binary Files --- application/parser/remote/drive_loader.py | 65 ----------------------- 1 file changed, 65 deletions(-) delete mode 100644 application/parser/remote/drive_loader.py diff --git a/application/parser/remote/drive_loader.py b/application/parser/remote/drive_loader.py deleted file mode 100644 index 5931e5aa..00000000 --- a/application/parser/remote/drive_loader.py +++ /dev/null @@ -1,65 +0,0 @@ -import os -from typing import List -from google.oauth2.credentials import Credentials -from googleapiclient.discovery import build -from googleapiclient.http import MediaIoBaseDownload -from google.auth.transport.requests import Request -from io import BytesIO -from langchain_core.documents import Document -from application.parser.remote.base import BaseRemote - -SCOPES = ['https://www.googleapis.com/auth/drive.readonly'] - -class GoogleDriveLoader(BaseRemote): - def __init__(self, token_path: str, credentials_path: str): - # Load OAuth2 credentials from token and credentials JSON files - self.creds = None - if os.path.exists(token_path): - self.creds = Credentials.from_authorized_user_file(token_path, SCOPES) - if not self.creds or not self.creds.valid: - if self.creds and self.creds.expired and self.creds.refresh_token: - self.creds.refresh(Request()) - else: - raise Exception("Invalid or missing credentials. Please authenticate.") - - # Initialize the Google Drive API client - self.service = build('drive', 'v3', credentials=self.creds) - - def fetch_file_content(self, file_id: str) -> str: - request = self.service.files().get_media(fileId=file_id) - file_io = BytesIO() - downloader = MediaIoBaseDownload(file_io, request) - - done = False - while not done: - status, done = downloader.next_chunk() - - file_io.seek(0) - return file_io.read().decode("utf-8") - - def fetch_drive_files(self, folder_id: str = 'root', mime_type_filter: List[str] = None) -> List[dict]: - query = f"'{folder_id}' in parents" - if mime_type_filter: - mime_types_query = " or ".join([f"mimeType='{mime_type}'" for mime_type in mime_type_filter]) - query += f" and ({mime_types_query})" - - results = self.service.files().list(q=query, pageSize=1000, fields="files(id, name, mimeType)").execute() - return results.get('files', []) - - def load_data(self, folder_id: str = 'root', mime_type_filter: List[str] = None) -> List[Document]: - # Fetch the list of files within the specified folder - files = self.fetch_drive_files(folder_id, mime_type_filter) - documents = [] - - # Loop over each file, download its content, and convert it into a document - for file in files: - if file['mimeType'] != 'application/vnd.google-apps.folder': - try: - content = self.fetch_file_content(file['id']) - documents.append(Document(page_content=content, metadata={ - "title": file['name'], - "source": f"https://drive.google.com/file/d/{file['id']}/view" - })) - except Exception as e: - print(f"Failed to load file {file['name']}: {e}") - return documents From d3238de8abd51252e84ce3d6b6a2c131bab9b07d Mon Sep 17 00:00:00 2001 From: "devendra.parihar" Date: Fri, 18 Oct 2024 12:23:17 +0530 Subject: [PATCH 3/4] fix: lint error --- application/parser/remote/github_loader.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/application/parser/remote/github_loader.py b/application/parser/remote/github_loader.py index 95c04102..8f805056 100644 --- a/application/parser/remote/github_loader.py +++ b/application/parser/remote/github_loader.py @@ -27,10 +27,8 @@ class GitHubLoader(BaseRemote): decoded_content = base64.b64decode(content["content"]).decode("utf-8") return f"Filename: {file_path}\n\n{decoded_content}" except Exception as e: - # print(f"Error decoding content for {file_path}: {e}") - raise + raise e else: - # print(f"Skipping binary file: {file_path} (MIME type: {mime_type})") return f"Filename: {file_path} is a binary file and was skipped." else: return f"Filename: {file_path}\n\n{content['content']}" From c77d415893bdb227224bb21e6e145f5c2418b93e Mon Sep 17 00:00:00 2001 From: JeevaRamanathan M Date: Thu, 24 Oct 2024 20:36:47 +0000 Subject: [PATCH 4/4] feat: JSON parser implementation Signed-off-by: JeevaRamanathan M --- application/api/user/routes.py | 1 + application/parser/file/bulk.py | 2 + application/parser/file/json_parser.py | 57 ++++++++++++++++++++++++++ frontend/src/index.css | 2 - frontend/src/locale/en.json | 2 +- frontend/src/locale/es.json | 2 +- frontend/src/locale/jp.json | 2 +- frontend/src/locale/zh-TW.json | 2 +- frontend/src/locale/zh.json | 2 +- frontend/src/upload/Upload.tsx | 1 + 10 files changed, 66 insertions(+), 7 deletions(-) create mode 100644 application/parser/file/json_parser.py diff --git a/application/api/user/routes.py b/application/api/user/routes.py index 2ead8ef1..b476345d 100644 --- a/application/api/user/routes.py +++ b/application/api/user/routes.py @@ -340,6 +340,7 @@ class UploadFile(Resource): ".epub", ".html", ".mdx", + ".json" ], job_name, final_filename, diff --git a/application/parser/file/bulk.py b/application/parser/file/bulk.py index 79fc2c45..bb63aa61 100644 --- a/application/parser/file/bulk.py +++ b/application/parser/file/bulk.py @@ -11,6 +11,7 @@ from application.parser.file.html_parser import HTMLParser from application.parser.file.markdown_parser import MarkdownParser from application.parser.file.rst_parser import RstParser from application.parser.file.tabular_parser import PandasCSVParser,ExcelParser +from application.parser.file.json_parser import JSONParser from application.parser.schema.base import Document DEFAULT_FILE_EXTRACTOR: Dict[str, BaseParser] = { @@ -23,6 +24,7 @@ DEFAULT_FILE_EXTRACTOR: Dict[str, BaseParser] = { ".rst": RstParser(), ".html": HTMLParser(), ".mdx": MarkdownParser(), + ".json":JSONParser(), } diff --git a/application/parser/file/json_parser.py b/application/parser/file/json_parser.py new file mode 100644 index 00000000..0201b420 --- /dev/null +++ b/application/parser/file/json_parser.py @@ -0,0 +1,57 @@ +import json +from typing import Any, Dict, List, Union +from pathlib import Path + +from application.parser.file.base_parser import BaseParser + +class JSONParser(BaseParser): + r"""JSON (.json) parser. + + Parses JSON files into a list of strings or a concatenated document. + It handles both JSON objects (dictionaries) and arrays (lists). + + Args: + concat_rows (bool): Whether to concatenate all rows into one document. + If set to False, a Document will be created for each item in the JSON. + True by default. + + row_joiner (str): Separator to use for joining each row. + Only used when `concat_rows=True`. + Set to "\n" by default. + + json_config (dict): Options for parsing JSON. Can be used to specify options like + custom decoding or formatting. Set to empty dict by default. + + """ + + def __init__( + self, + *args: Any, + concat_rows: bool = True, + row_joiner: str = "\n", + json_config: dict = {}, + **kwargs: Any + ) -> None: + """Init params.""" + super().__init__(*args, **kwargs) + self._concat_rows = concat_rows + self._row_joiner = row_joiner + self._json_config = json_config + + def _init_parser(self) -> Dict: + """Init parser.""" + return {} + + def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]]: + """Parse JSON file.""" + + with open(file, 'r', encoding='utf-8') as f: + data = json.load(f, **self._json_config) + + if isinstance(data, dict): + data = [data] + + if self._concat_rows: + return self._row_joiner.join([str(item) for item in data]) + else: + return data diff --git a/frontend/src/index.css b/frontend/src/index.css index 1eca983c..9b87724a 100644 --- a/frontend/src/index.css +++ b/frontend/src/index.css @@ -67,10 +67,8 @@ body.dark { .table-default td:last-child { @apply border-r-0; /* Ensure no right border on the last column */ } - } - /*! normalize.css v8.0.1 | MIT License | github.com/necolas/normalize.css */ /* Document diff --git a/frontend/src/locale/en.json b/frontend/src/locale/en.json index cefb99b7..ec77b7e0 100644 --- a/frontend/src/locale/en.json +++ b/frontend/src/locale/en.json @@ -86,7 +86,7 @@ "start": "Start Chatting", "name": "Name", "choose": "Choose Files", - "info": "Please upload .pdf, .txt, .rst, .csv, .xlsx, .docx, .md, .zip limited to 25mb", + "info": "Please upload .pdf, .txt, .rst, .csv, .xlsx, .docx, .md, .json, .zip limited to 25mb", "uploadedFiles": "Uploaded Files", "cancel": "Cancel", "train": "Train", diff --git a/frontend/src/locale/es.json b/frontend/src/locale/es.json index 66b457e8..9ee7d566 100644 --- a/frontend/src/locale/es.json +++ b/frontend/src/locale/es.json @@ -86,7 +86,7 @@ "start": "Empezar a chatear", "name": "Nombre", "choose": "Seleccionar Archivos", - "info": "Por favor, suba archivos .pdf, .txt, .rst, .docx, .md, .zip limitados a 25 MB", + "info": "Por favor, suba archivos .pdf, .txt, .rst, .docx, .md, .json, .zip limitados a 25 MB", "uploadedFiles": "Archivos Subidos", "cancel": "Cancelar", "train": "Entrenar", diff --git a/frontend/src/locale/jp.json b/frontend/src/locale/jp.json index 53f1da14..841a477b 100644 --- a/frontend/src/locale/jp.json +++ b/frontend/src/locale/jp.json @@ -86,7 +86,7 @@ "start": "チャットを開始する", "name": "名前", "choose": "ファイルを選択", - "info": ".pdf, .txt, .rst, .docx, .md, .zipファイルを25MBまでアップロードしてください", + "info": ".pdf, .txt, .rst, .docx, .md, .json, .zipファイルを25MBまでアップロードしてください", "uploadedFiles": "アップロードされたファイル", "cancel": "キャンセル", "train": "トレーニング", diff --git a/frontend/src/locale/zh-TW.json b/frontend/src/locale/zh-TW.json index afcef769..35df818b 100644 --- a/frontend/src/locale/zh-TW.json +++ b/frontend/src/locale/zh-TW.json @@ -80,7 +80,7 @@ "remote": "遠端", "name": "名稱", "choose": "選擇檔案", - "info": "請上傳 .pdf, .txt, .rst, .docx, .md, .zip 檔案,大小限制為 25MB", + "info": "請上傳 .pdf, .txt, .rst, .docx, .md, .json, .zip 檔案,大小限制為 25MB", "uploadedFiles": "已上傳的檔案", "cancel": "取消", "train": "訓練", diff --git a/frontend/src/locale/zh.json b/frontend/src/locale/zh.json index 0d60a701..0abe7a59 100644 --- a/frontend/src/locale/zh.json +++ b/frontend/src/locale/zh.json @@ -86,7 +86,7 @@ "start": "开始聊天", "name": "名称", "choose": "选择文件", - "info": "请上传 .pdf, .txt, .rst, .docx, .md, .zip 文件,限 25MB", + "info": "请上传 .pdf, .txt, .rst, .docx, .md, .json, .zip 文件,限 25MB", "uploadedFiles": "已上传文件", "cancel": "取消", "train": "训练", diff --git a/frontend/src/upload/Upload.tsx b/frontend/src/upload/Upload.tsx index 37a1fc0c..140fc688 100644 --- a/frontend/src/upload/Upload.tsx +++ b/frontend/src/upload/Upload.tsx @@ -314,6 +314,7 @@ function Upload({ 'application/zip': ['.zip'], 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': ['.docx'], + 'application/json': ['.json'], 'text/csv': ['.csv'], 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': [ '.xlsx',