Merge branch 'arc53:main' into feature-TTS

2026-02-16 03:00:53 +00:00 · 2024-10-27 10:46:36 +05:30
parent 05f756963c 1c791f240a
commit 09a1879f3e
11 changed files with 77 additions and 13 deletions
--- a/application/api/user/routes.py
+++ b/application/api/user/routes.py
@@ -340,6 +340,7 @@ class UploadFile(Resource):
                        ".epub",
                        ".html",
                        ".mdx",
+                        ".json",
                        ".xlsx",
                    ],
                    job_name,
--- a/application/parser/file/bulk.py
+++ b/application/parser/file/bulk.py
@@ -11,6 +11,7 @@ from application.parser.file.html_parser import HTMLParser
 from application.parser.file.markdown_parser import MarkdownParser
 from application.parser.file.rst_parser import RstParser
 from application.parser.file.tabular_parser import PandasCSVParser,ExcelParser
+from application.parser.file.json_parser import JSONParser
 from application.parser.schema.base import Document

 DEFAULT_FILE_EXTRACTOR: Dict[str, BaseParser] = {
@@ -23,6 +24,7 @@ DEFAULT_FILE_EXTRACTOR: Dict[str, BaseParser] = {
    ".rst": RstParser(),
    ".html": HTMLParser(),
    ".mdx": MarkdownParser(),
+    ".json":JSONParser(),
 }


--- a/application/parser/file/json_parser.py
+++ b/application/parser/file/json_parser.py
@@ -0,0 +1,57 @@
+import json
+from typing import Any, Dict, List, Union
+from pathlib import Path
+
+from application.parser.file.base_parser import BaseParser
+
+class JSONParser(BaseParser):
+    r"""JSON (.json) parser.
+
+    Parses a JSON file into one concatenated string or a list of strings.
+    A top-level array yields one row per element; any other top-level value
+    (object, string, number, boolean, null) is treated as a single row.
+
+    Args:
+        concat_rows (bool): Whether to concatenate all rows into one document.
+            If set to False, one string per row is returned instead.
+            True by default.
+
+        row_joiner (str): Separator to use for joining each row.
+            Only used when `concat_rows=True`.
+            Set to "\n" by default.
+
+        json_config (dict): Keyword arguments forwarded to `json.load`
+            (e.g. `object_hook`, `parse_float`). No extra arguments by default.
+
+    """
+
+    def __init__(
+            self,
+            *args: Any,
+            concat_rows: bool = True,
+            row_joiner: str = "\n",
+            json_config: Union[dict, None] = None,
+            **kwargs: Any
+    ) -> None:
+        """Init params."""
+        super().__init__(*args, **kwargs)
+        self._concat_rows = concat_rows
+        self._row_joiner = row_joiner
+        # Copy so instances never share (or alias) a mutable options dict.
+        self._json_config = dict(json_config or {})
+
+    def _init_parser(self) -> Dict:
+        """Init parser."""
+        return {}
+
+    def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]]:
+        """Parse a UTF-8 JSON file; `errors` is accepted but currently unused."""
+        with open(file, "r", encoding="utf-8") as f:
+            data = json.load(f, **self._json_config)
+
+        # Wrap any non-array value so a scalar/string is one row, not iterated.
+        if not isinstance(data, list):
+            data = [data]
+
+        rows = [str(item) for item in data]  # rows use Python's str() form
+        return self._row_joiner.join(rows) if self._concat_rows else rows
--- a/application/parser/remote/github_loader.py
+++ b/application/parser/remote/github_loader.py
@@ -3,6 +3,7 @@ import requests
 from typing import List
 from application.parser.remote.base import BaseRemote
 from langchain_core.documents import Document
+import mimetypes

 class GitHubLoader(BaseRemote):
    def __init__(self):
@@ -18,13 +19,17 @@ class GitHubLoader(BaseRemote):

        if response.status_code == 200:
            content = response.json()
+            mime_type, _ = mimetypes.guess_type(file_path)  # Guess the MIME type based on the file extension
+
            if content.get("encoding") == "base64":
-                try:
-                    decoded_content = base64.b64decode(content["content"]).decode("utf-8")
-                    return f"Filename: {file_path}\n\n{decoded_content}"
-                except Exception as e:
-                    print(f"Error decoding content for {file_path}: {e}")
-                    raise
+                if mime_type and mime_type.startswith("text"):  # Handle only text files
+                    try:
+                        decoded_content = base64.b64decode(content["content"]).decode("utf-8")
+                        return f"Filename: {file_path}\n\n{decoded_content}"
+                    except Exception as e:
+                        raise e
+                else:
+                    return f"Filename: {file_path} is a binary file and was skipped."
            else:
                return f"Filename: {file_path}\n\n{content['content']}"
        else:
--- a/frontend/src/index.css
+++ b/frontend/src/index.css
@@ -67,10 +67,8 @@ body.dark {
  .table-default td:last-child {
    @apply border-r-0; /* Ensure no right border on the last column */
  }
-
 }

-
 /*! normalize.css v8.0.1 | MIT License | github.com/necolas/normalize.css */

 /* Document
--- a/frontend/src/locale/en.json
+++ b/frontend/src/locale/en.json
@@ -86,7 +86,7 @@
      "start": "Start Chatting",
      "name": "Name",
      "choose": "Choose Files",
-      "info": "Please upload .pdf, .txt, .rst, .csv, .xlsx, .docx, .md, .html, .epub, .zip limited to 25mb",
+      "info": "Please upload .pdf, .txt, .rst, .csv, .xlsx, .docx, .md, .html, .epub, .json, .zip limited to 25mb",
      "uploadedFiles": "Uploaded Files",
      "cancel": "Cancel",
      "train": "Train",
--- a/frontend/src/locale/es.json
+++ b/frontend/src/locale/es.json
@@ -86,7 +86,7 @@
      "start": "Empezar a chatear",
      "name": "Nombre",
      "choose": "Seleccionar Archivos",
-      "info": "Por favor, suba archivos .pdf, .txt, .rst, .csv, .xlsx, .docx, .md, .html, .epub, .zip limitados a 25 MB",
+      "info": "Por favor, suba archivos .pdf, .txt, .rst, .csv, .xlsx, .docx, .md, .html, .epub, .json, .zip limitados a 25 MB",
      "uploadedFiles": "Archivos Subidos",
      "cancel": "Cancelar",
      "train": "Entrenar",
--- a/frontend/src/locale/jp.json
+++ b/frontend/src/locale/jp.json
@@ -86,7 +86,7 @@
      "start": "チャットを開始する",
      "name": "名前",
      "choose": "ファイルを選択",
-      "info": ".pdf, .txt, .rst, .docx, .md, .zipファイルを25MBまでアップロードしてください",
+      "info": ".pdf, .txt, .rst, .docx, .md, .json, .zipファイルを25MBまでアップロードしてください",
      "uploadedFiles": "アップロードされたファイル",
      "cancel": "キャンセル",
      "train": "トレーニング",
--- a/frontend/src/locale/zh-TW.json
+++ b/frontend/src/locale/zh-TW.json
@@ -80,7 +80,7 @@
      "remote": "遠端",
      "name": "名稱",
      "choose": "選擇檔案",
-      "info": "請上傳 .pdf, .txt, .rst, .docx, .md, .zip 檔案，大小限制為 25MB",
+      "info": "請上傳 .pdf, .txt, .rst, .docx, .md, .json, .zip 檔案，大小限制為 25MB",
      "uploadedFiles": "已上傳的檔案",
      "cancel": "取消",
      "train": "訓練",
--- a/frontend/src/locale/zh.json
+++ b/frontend/src/locale/zh.json
@@ -86,7 +86,7 @@
      "start": "开始聊天",
      "name": "名称",
      "choose": "选择文件",
-      "info": "请上传 .pdf, .txt, .rst, .csv, .xlsx, .docx, .md, .html, .epub, .zip 文件，限 25MB",
+      "info": "请上传 .pdf, .txt, .rst, .csv, .xlsx, .docx, .md, .html, .epub, .json, .zip 文件，限 25MB",
      "uploadedFiles": "已上传文件",
      "cancel": "取消",
      "train": "训练",
--- a/frontend/src/upload/Upload.tsx
+++ b/frontend/src/upload/Upload.tsx
@@ -314,6 +314,7 @@ function Upload({
      'application/zip': ['.zip'],
      'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
        ['.docx'],
+      'application/json': ['.json'],
      'text/csv': ['.csv'],
      'text/html': ['.html'],
      'application/epub+zip': ['.epub'],