Merge branch 'main' of https://github.com/ManishMadan2882/docsgpt

2026-02-13 17:50:47 +00:00 · 2024-10-27 01:28:12 +05:30
parent 0aa9da39a9 1c791f240a
commit 1627d424e7
30 changed files with 554 additions and 86 deletions
--- a/application/api/user/routes.py
+++ b/application/api/user/routes.py
@@ -340,6 +340,8 @@ class UploadFile(Resource):
                        ".epub",
                        ".html",
                        ".mdx",
+                        ".json",
+                        ".xlsx",
                    ],
                    job_name,
                    final_filename,
--- a/application/parser/file/bulk.py
+++ b/application/parser/file/bulk.py
@@ -11,6 +11,7 @@ from application.parser.file.html_parser import HTMLParser
 from application.parser.file.markdown_parser import MarkdownParser
 from application.parser.file.rst_parser import RstParser
 from application.parser.file.tabular_parser import PandasCSVParser,ExcelParser
+from application.parser.file.json_parser import JSONParser
 from application.parser.schema.base import Document

 DEFAULT_FILE_EXTRACTOR: Dict[str, BaseParser] = {
@@ -23,6 +24,7 @@ DEFAULT_FILE_EXTRACTOR: Dict[str, BaseParser] = {
    ".rst": RstParser(),
    ".html": HTMLParser(),
    ".mdx": MarkdownParser(),
+    ".json":JSONParser(),
 }


--- a/application/parser/file/json_parser.py
+++ b/application/parser/file/json_parser.py
@@ -0,0 +1,57 @@
+import json
+from typing import Any, Dict, List, Union
+from pathlib import Path
+
+from application.parser.file.base_parser import BaseParser
+
+class JSONParser(BaseParser):
+    r"""JSON (.json) parser.
+
+    Loads a JSON file and returns either one joined string or the decoded items.
+    A top-level value that is not an array is treated as a single-item list.
+
+    Args:
+        concat_rows (bool): If True (default), return `str()` of every item
+            joined by `row_joiner`; if False, return the decoded items as a list.
+        row_joiner (str): Separator between items; only used when
+            `concat_rows=True`. Set to "\n" by default.
+        json_config (dict): Extra keyword arguments passed to `json.load`.
+            Defaults to no extra arguments.
+
+    """
+
+    def __init__(
+            self,
+            *args: Any,
+            concat_rows: bool = True,
+            row_joiner: str = "\n",
+            json_config: Union[dict, None] = None,
+            **kwargs: Any
+    ) -> None:
+        """Init params."""
+        super().__init__(*args, **kwargs)
+        self._concat_rows = concat_rows
+        self._row_joiner = row_joiner
+        # Copy so neither a shared default nor the caller's dict is aliased.
+        self._json_config = dict(json_config or {})
+
+    def _init_parser(self) -> Dict:
+        """Init parser."""
+        return {}
+
+    def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[Any]]:
+        """Parse JSON file.
+
+        `errors` is the decode-error policy handed to `open`; invalid JSON
+        still raises `json.JSONDecodeError`.
+        """
+        with open(file, "r", encoding="utf-8", errors=errors) as f:
+            data = json.load(f, **self._json_config)
+
+        # Wrap objects and scalars alike; a bare string would be joined per character.
+        if not isinstance(data, list):
+            data = [data]
+
+        if self._concat_rows:
+            return self._row_joiner.join(str(item) for item in data)
+        return data
--- a/application/parser/remote/github_loader.py
+++ b/application/parser/remote/github_loader.py
@@ -3,6 +3,7 @@ import requests
 from typing import List
 from application.parser.remote.base import BaseRemote
 from langchain_core.documents import Document
+import mimetypes

 class GitHubLoader(BaseRemote):
    def __init__(self):
@@ -18,13 +19,17 @@ class GitHubLoader(BaseRemote):

        if response.status_code == 200:
            content = response.json()
+            mime_type, _ = mimetypes.guess_type(file_path)  # Guess the MIME type based on the file extension
+
            if content.get("encoding") == "base64":
-                try:
-                    decoded_content = base64.b64decode(content["content"]).decode("utf-8")
-                    return f"Filename: {file_path}\n\n{decoded_content}"
-                except Exception as e:
-                    print(f"Error decoding content for {file_path}: {e}")
-                    raise
+                if mime_type and mime_type.startswith("text"):  # Handle only text files
+                    try:
+                        decoded_content = base64.b64decode(content["content"]).decode("utf-8")
+                        return f"Filename: {file_path}\n\n{decoded_content}"
+                    except Exception as e:
+                        raise e
+                else:
+                    return f"Filename: {file_path} is a binary file and was skipped."
            else:
                return f"Filename: {file_path}\n\n{content['content']}"
        else: