From a4c0861cf4b79d08338147664531a0dae1f38ce7 Mon Sep 17 00:00:00 2001
From: "devendra.parihar" <devendra.parihar@heliossolutions.co>
Date: Fri, 18 Oct 2024 12:07:44 +0530
Subject: [PATCH 1/4] fix: GitHubLoader to Handle Binary Files

---
 application/parser/remote/drive_loader.py  | 65 ++++++++++++++++++++++
 application/parser/remote/github_loader.py | 19 +++++--
 2 files changed, 78 insertions(+), 6 deletions(-)
 create mode 100644 application/parser/remote/drive_loader.py

diff --git a/application/parser/remote/drive_loader.py b/application/parser/remote/drive_loader.py
new file mode 100644
index 00000000..5931e5aa
--- /dev/null
+++ b/application/parser/remote/drive_loader.py
@@ -0,0 +1,65 @@
+import os
+from typing import List
+from google.oauth2.credentials import Credentials
+from googleapiclient.discovery import build
+from googleapiclient.http import MediaIoBaseDownload
+from google.auth.transport.requests import Request
+from io import BytesIO
+from langchain_core.documents import Document
+from application.parser.remote.base import BaseRemote
+
+SCOPES = ['https://www.googleapis.com/auth/drive.readonly']
+
+class GoogleDriveLoader(BaseRemote):
+    def __init__(self, token_path: str, credentials_path: str):
+        # Load OAuth2 credentials from token and credentials JSON files
+        self.creds = None
+        if os.path.exists(token_path):
+            self.creds = Credentials.from_authorized_user_file(token_path, SCOPES)
+        if not self.creds or not self.creds.valid:
+            if self.creds and self.creds.expired and self.creds.refresh_token:
+                self.creds.refresh(Request())
+            else:
+                raise Exception("Invalid or missing credentials. Please authenticate.")
+
+        # Initialize the Google Drive API client
+        self.service = build('drive', 'v3', credentials=self.creds)
+
+    def fetch_file_content(self, file_id: str) -> str:
+        request = self.service.files().get_media(fileId=file_id)
+        file_io = BytesIO()
+        downloader = MediaIoBaseDownload(file_io, request)
+
+        done = False
+        while not done:
+            status, done = downloader.next_chunk()
+
+        file_io.seek(0)
+        return file_io.read().decode("utf-8")
+
+    def fetch_drive_files(self, folder_id: str = 'root', mime_type_filter: List[str] = None) -> List[dict]:
+        query = f"'{folder_id}' in parents"
+        if mime_type_filter:
+            mime_types_query = " or ".join([f"mimeType='{mime_type}'" for mime_type in mime_type_filter])
+            query += f" and ({mime_types_query})"
+
+        results = self.service.files().list(q=query, pageSize=1000, fields="files(id, name, mimeType)").execute()
+        return results.get('files', [])
+
+    def load_data(self, folder_id: str = 'root', mime_type_filter: List[str] = None) -> List[Document]:
+        # Fetch the list of files within the specified folder
+        files = self.fetch_drive_files(folder_id, mime_type_filter)
+        documents = []
+        
+        # Loop over each file, download its content, and convert it into a document
+        for file in files:
+            if file['mimeType'] != 'application/vnd.google-apps.folder':
+                try:
+                    content = self.fetch_file_content(file['id'])
+                    documents.append(Document(page_content=content, metadata={
+                        "title": file['name'],
+                        "source": f"https://drive.google.com/file/d/{file['id']}/view"
+                    }))
+                except Exception as e:
+                    print(f"Failed to load file {file['name']}: {e}")
+        return documents
diff --git a/application/parser/remote/github_loader.py b/application/parser/remote/github_loader.py
index 49f0ae9c..95c04102 100644
--- a/application/parser/remote/github_loader.py
+++ b/application/parser/remote/github_loader.py
@@ -3,6 +3,7 @@ import requests
 from typing import List
 from application.parser.remote.base import BaseRemote
 from langchain_core.documents import Document
+import mimetypes
 
 class GitHubLoader(BaseRemote):
     def __init__(self):
@@ -18,13 +19,19 @@ class GitHubLoader(BaseRemote):
 
         if response.status_code == 200:
             content = response.json()
+            mime_type, _ = mimetypes.guess_type(file_path)  # Guess the MIME type based on the file extension
+
             if content.get("encoding") == "base64":
-                try:
-                    decoded_content = base64.b64decode(content["content"]).decode("utf-8")
-                    return f"Filename: {file_path}\n\n{decoded_content}"
-                except Exception as e:
-                    print(f"Error decoding content for {file_path}: {e}")
-                    raise
+                if mime_type and mime_type.startswith("text"):  # Handle only text files
+                    try:
+                        decoded_content = base64.b64decode(content["content"]).decode("utf-8")
+                        return f"Filename: {file_path}\n\n{decoded_content}"
+                    except Exception as e:
+                        # print(f"Error decoding content for {file_path}: {e}")
+                        raise
+                else:
+                    # print(f"Skipping binary file: {file_path} (MIME type: {mime_type})")
+                    return f"Filename: {file_path} is a binary file and was skipped."
             else:
                 return f"Filename: {file_path}\n\n{content['content']}"
         else:

From 09a27053117a0f55450ab7734f573378b8e360a0 Mon Sep 17 00:00:00 2001
From: "devendra.parihar" <devendra.parihar@heliossolutions.co>
Date: Fri, 18 Oct 2024 12:08:08 +0530
Subject: [PATCH 2/4] fix: GitHubLoader to Handle Binary Files

---
 application/parser/remote/drive_loader.py | 65 -----------------------
 1 file changed, 65 deletions(-)
 delete mode 100644 application/parser/remote/drive_loader.py

diff --git a/application/parser/remote/drive_loader.py b/application/parser/remote/drive_loader.py
deleted file mode 100644
index 5931e5aa..00000000
--- a/application/parser/remote/drive_loader.py
+++ /dev/null
@@ -1,65 +0,0 @@
-import os
-from typing import List
-from google.oauth2.credentials import Credentials
-from googleapiclient.discovery import build
-from googleapiclient.http import MediaIoBaseDownload
-from google.auth.transport.requests import Request
-from io import BytesIO
-from langchain_core.documents import Document
-from application.parser.remote.base import BaseRemote
-
-SCOPES = ['https://www.googleapis.com/auth/drive.readonly']
-
-class GoogleDriveLoader(BaseRemote):
-    def __init__(self, token_path: str, credentials_path: str):
-        # Load OAuth2 credentials from token and credentials JSON files
-        self.creds = None
-        if os.path.exists(token_path):
-            self.creds = Credentials.from_authorized_user_file(token_path, SCOPES)
-        if not self.creds or not self.creds.valid:
-            if self.creds and self.creds.expired and self.creds.refresh_token:
-                self.creds.refresh(Request())
-            else:
-                raise Exception("Invalid or missing credentials. Please authenticate.")
-
-        # Initialize the Google Drive API client
-        self.service = build('drive', 'v3', credentials=self.creds)
-
-    def fetch_file_content(self, file_id: str) -> str:
-        request = self.service.files().get_media(fileId=file_id)
-        file_io = BytesIO()
-        downloader = MediaIoBaseDownload(file_io, request)
-
-        done = False
-        while not done:
-            status, done = downloader.next_chunk()
-
-        file_io.seek(0)
-        return file_io.read().decode("utf-8")
-
-    def fetch_drive_files(self, folder_id: str = 'root', mime_type_filter: List[str] = None) -> List[dict]:
-        query = f"'{folder_id}' in parents"
-        if mime_type_filter:
-            mime_types_query = " or ".join([f"mimeType='{mime_type}'" for mime_type in mime_type_filter])
-            query += f" and ({mime_types_query})"
-
-        results = self.service.files().list(q=query, pageSize=1000, fields="files(id, name, mimeType)").execute()
-        return results.get('files', [])
-
-    def load_data(self, folder_id: str = 'root', mime_type_filter: List[str] = None) -> List[Document]:
-        # Fetch the list of files within the specified folder
-        files = self.fetch_drive_files(folder_id, mime_type_filter)
-        documents = []
-        
-        # Loop over each file, download its content, and convert it into a document
-        for file in files:
-            if file['mimeType'] != 'application/vnd.google-apps.folder':
-                try:
-                    content = self.fetch_file_content(file['id'])
-                    documents.append(Document(page_content=content, metadata={
-                        "title": file['name'],
-                        "source": f"https://drive.google.com/file/d/{file['id']}/view"
-                    }))
-                except Exception as e:
-                    print(f"Failed to load file {file['name']}: {e}")
-        return documents

From d3238de8abd51252e84ce3d6b6a2c131bab9b07d Mon Sep 17 00:00:00 2001
From: "devendra.parihar" <devendra.parihar@heliossolutions.co>
Date: Fri, 18 Oct 2024 12:23:17 +0530
Subject: [PATCH 3/4] fix: lint error

---
 application/parser/remote/github_loader.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/application/parser/remote/github_loader.py b/application/parser/remote/github_loader.py
index 95c04102..8f805056 100644
--- a/application/parser/remote/github_loader.py
+++ b/application/parser/remote/github_loader.py
@@ -27,10 +27,8 @@ class GitHubLoader(BaseRemote):
                         decoded_content = base64.b64decode(content["content"]).decode("utf-8")
                         return f"Filename: {file_path}\n\n{decoded_content}"
                     except Exception as e:
-                        # print(f"Error decoding content for {file_path}: {e}")
-                        raise
+                        raise e
                 else:
-                    # print(f"Skipping binary file: {file_path} (MIME type: {mime_type})")
                     return f"Filename: {file_path} is a binary file and was skipped."
             else:
                 return f"Filename: {file_path}\n\n{content['content']}"

From c77d415893bdb227224bb21e6e145f5c2418b93e Mon Sep 17 00:00:00 2001
From: JeevaRamanathan M <jeevaramanathan.m@infosys.com>
Date: Thu, 24 Oct 2024 20:36:47 +0000
Subject: [PATCH 4/4] feat: JSON parser implementation

Signed-off-by: JeevaRamanathan M <jeevaramanathan.m@infosys.com>
---
 application/api/user/routes.py         |  1 +
 application/parser/file/bulk.py        |  2 +
 application/parser/file/json_parser.py | 57 ++++++++++++++++++++++++++
 frontend/src/index.css                 |  2 -
 frontend/src/locale/en.json            |  2 +-
 frontend/src/locale/es.json            |  2 +-
 frontend/src/locale/jp.json            |  2 +-
 frontend/src/locale/zh-TW.json         |  2 +-
 frontend/src/locale/zh.json            |  2 +-
 frontend/src/upload/Upload.tsx         |  1 +
 10 files changed, 66 insertions(+), 7 deletions(-)
 create mode 100644 application/parser/file/json_parser.py

diff --git a/application/api/user/routes.py b/application/api/user/routes.py
index 2ead8ef1..b476345d 100644
--- a/application/api/user/routes.py
+++ b/application/api/user/routes.py
@@ -340,6 +340,7 @@ class UploadFile(Resource):
                         ".epub",
                         ".html",
                         ".mdx",
+                        ".json"
                     ],
                     job_name,
                     final_filename,
diff --git a/application/parser/file/bulk.py b/application/parser/file/bulk.py
index 79fc2c45..bb63aa61 100644
--- a/application/parser/file/bulk.py
+++ b/application/parser/file/bulk.py
@@ -11,6 +11,7 @@ from application.parser.file.html_parser import HTMLParser
 from application.parser.file.markdown_parser import MarkdownParser
 from application.parser.file.rst_parser import RstParser
 from application.parser.file.tabular_parser import PandasCSVParser,ExcelParser
+from application.parser.file.json_parser import JSONParser
 from application.parser.schema.base import Document
 
 DEFAULT_FILE_EXTRACTOR: Dict[str, BaseParser] = {
@@ -23,6 +24,7 @@ DEFAULT_FILE_EXTRACTOR: Dict[str, BaseParser] = {
     ".rst": RstParser(),
     ".html": HTMLParser(),
     ".mdx": MarkdownParser(),
+    ".json":JSONParser(),
 }
 
 
diff --git a/application/parser/file/json_parser.py b/application/parser/file/json_parser.py
new file mode 100644
index 00000000..0201b420
--- /dev/null
+++ b/application/parser/file/json_parser.py
@@ -0,0 +1,57 @@
+import json
+from typing import Any, Dict, List, Union
+from pathlib import Path
+
+from application.parser.file.base_parser import BaseParser
+
+class JSONParser(BaseParser):
+    r"""JSON (.json) parser.
+
+    Parses a JSON file into one concatenated string or a list of strings.
+    A top-level value that is not an array is treated as a single row.
+
+    Args:
+        concat_rows (bool): Whether to concatenate all rows into one document.
+            If set to False, one string is returned per top-level item.
+            True by default.
+
+        row_joiner (str): Separator to use for joining each row.
+            Only used when `concat_rows=True`.
+            Set to "\n" by default.
+
+        json_config (dict): Extra keyword arguments forwarded to `json.load`
+            (e.g. `object_hook`). Treated as empty when omitted.
+
+    """
+
+    def __init__(
+            self,
+            *args: Any,
+            concat_rows: bool = True,
+            row_joiner: str = "\n",
+            json_config: Union[dict, None] = None,
+            **kwargs: Any
+    ) -> None:
+        """Init params."""
+        super().__init__(*args, **kwargs)
+        self._concat_rows = concat_rows
+        self._row_joiner = row_joiner
+        # Copy so instances never share (or mutate) a default/caller dict.
+        self._json_config = dict(json_config or {})
+
+    def _init_parser(self) -> Dict:
+        """Init parser."""
+        return {}
+
+    def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]]:
+        """Parse JSON file; `errors` is the decoding error policy for `open`."""
+        with open(file, "r", encoding="utf-8", errors=errors) as f:
+            data = json.load(f, **self._json_config)
+
+        # Wrap objects and scalars so a bare string is not split into chars.
+        if not isinstance(data, list):
+            data = [data]
+
+        # Stringify every row so both return shapes match the annotation.
+        rows = [str(item) for item in data]
+        return self._row_joiner.join(rows) if self._concat_rows else rows
diff --git a/frontend/src/index.css b/frontend/src/index.css
index 1eca983c..9b87724a 100644
--- a/frontend/src/index.css
+++ b/frontend/src/index.css
@@ -67,10 +67,8 @@ body.dark {
   .table-default td:last-child {
     @apply border-r-0; /* Ensure no right border on the last column */
   }
-
 }
 
-
 /*! normalize.css v8.0.1 | MIT License | github.com/necolas/normalize.css */
 
 /* Document
diff --git a/frontend/src/locale/en.json b/frontend/src/locale/en.json
index cefb99b7..ec77b7e0 100644
--- a/frontend/src/locale/en.json
+++ b/frontend/src/locale/en.json
@@ -86,7 +86,7 @@
       "start": "Start Chatting",
       "name": "Name",
       "choose": "Choose Files",
-      "info": "Please upload .pdf, .txt, .rst, .csv, .xlsx, .docx, .md, .zip limited to 25mb",
+      "info": "Please upload .pdf, .txt, .rst, .csv, .xlsx, .docx, .md, .json, .zip limited to 25mb",
       "uploadedFiles": "Uploaded Files",
       "cancel": "Cancel",
       "train": "Train",
diff --git a/frontend/src/locale/es.json b/frontend/src/locale/es.json
index 66b457e8..9ee7d566 100644
--- a/frontend/src/locale/es.json
+++ b/frontend/src/locale/es.json
@@ -86,7 +86,7 @@
       "start": "Empezar a chatear",
       "name": "Nombre",
       "choose": "Seleccionar Archivos",
-      "info": "Por favor, suba archivos .pdf, .txt, .rst, .docx, .md, .zip limitados a 25 MB",
+      "info": "Por favor, suba archivos .pdf, .txt, .rst, .docx, .md, .json, .zip limitados a 25 MB",
       "uploadedFiles": "Archivos Subidos",
       "cancel": "Cancelar",
       "train": "Entrenar",
diff --git a/frontend/src/locale/jp.json b/frontend/src/locale/jp.json
index 53f1da14..841a477b 100644
--- a/frontend/src/locale/jp.json
+++ b/frontend/src/locale/jp.json
@@ -86,7 +86,7 @@
       "start": "チャットを開始する",
       "name": "名前",
       "choose": "ファイルを選択",
-      "info": ".pdf, .txt, .rst, .docx, .md, .zipファイルを25MBまでアップロードしてください",
+      "info": ".pdf, .txt, .rst, .docx, .md, .json, .zipファイルを25MBまでアップロードしてください",
       "uploadedFiles": "アップロードされたファイル",
       "cancel": "キャンセル",
       "train": "トレーニング",
diff --git a/frontend/src/locale/zh-TW.json b/frontend/src/locale/zh-TW.json
index afcef769..35df818b 100644
--- a/frontend/src/locale/zh-TW.json
+++ b/frontend/src/locale/zh-TW.json
@@ -80,7 +80,7 @@
       "remote": "遠端",
       "name": "名稱",
       "choose": "選擇檔案",
-      "info": "請上傳 .pdf, .txt, .rst, .docx, .md, .zip 檔案，大小限制為 25MB",
+      "info": "請上傳 .pdf, .txt, .rst, .docx, .md, .json, .zip 檔案，大小限制為 25MB",
       "uploadedFiles": "已上傳的檔案",
       "cancel": "取消",
       "train": "訓練",
diff --git a/frontend/src/locale/zh.json b/frontend/src/locale/zh.json
index 0d60a701..0abe7a59 100644
--- a/frontend/src/locale/zh.json
+++ b/frontend/src/locale/zh.json
@@ -86,7 +86,7 @@
       "start": "开始聊天",
       "name": "名称",
       "choose": "选择文件",
-      "info": "请上传 .pdf, .txt, .rst, .docx, .md, .zip 文件，限 25MB",
+      "info": "请上传 .pdf, .txt, .rst, .docx, .md, .json, .zip 文件，限 25MB",
       "uploadedFiles": "已上传文件",
       "cancel": "取消",
       "train": "训练",
diff --git a/frontend/src/upload/Upload.tsx b/frontend/src/upload/Upload.tsx
index 37a1fc0c..140fc688 100644
--- a/frontend/src/upload/Upload.tsx
+++ b/frontend/src/upload/Upload.tsx
@@ -314,6 +314,7 @@ function Upload({
       'application/zip': ['.zip'],
       'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
         ['.docx'],
+      'application/json': ['.json'],
       'text/csv': ['.csv'],
       'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': [
         '.xlsx',