fix: GitHubLoader to Handle Binary Files

2025-11-29 08:33:20 +00:00 · 2024-10-18 12:07:44 +05:30
parent 7bf79675c1
commit a4c0861cf4
2 changed files with 78 additions and 6 deletions
--- a/application/parser/remote/drive_loader.py
+++ b/application/parser/remote/drive_loader.py
@@ -0,0 +1,126 @@
+import os
+from typing import List, Optional
+from google.oauth2.credentials import Credentials
+from googleapiclient.discovery import build
+from googleapiclient.http import MediaIoBaseDownload
+from google.auth.transport.requests import Request
+from io import BytesIO
+from langchain_core.documents import Document
+from application.parser.remote.base import BaseRemote
+
+SCOPES = ['https://www.googleapis.com/auth/drive.readonly']
+
+class GoogleDriveLoader(BaseRemote):
+    """Load the files of a Google Drive folder as LangChain documents."""
+
+    def __init__(self, token_path: str, credentials_path: str):
+        """Authenticate against the Drive API with a stored OAuth2 token.
+
+        Args:
+            token_path: Path to the authorized-user token JSON file.
+            credentials_path: Not read by this constructor; kept so the
+                signature stays compatible with existing callers.
+
+        Raises:
+            Exception: If no usable credentials can be loaded, i.e. the
+                token file is missing or the token is invalid and cannot
+                be refreshed.
+        """
+        self.creds = None
+        if os.path.exists(token_path):
+            self.creds = Credentials.from_authorized_user_file(token_path, SCOPES)
+        if not self.creds or not self.creds.valid:
+            if self.creds and self.creds.expired and self.creds.refresh_token:
+                self.creds.refresh(Request())
+            else:
+                raise Exception("Invalid or missing credentials. Please authenticate.")
+
+        # Initialize the Google Drive API client
+        self.service = build('drive', 'v3', credentials=self.creds)
+
+    @staticmethod
+    def _escape_query_value(value: str) -> str:
+        """Escape backslashes and single quotes for a quoted Drive query value."""
+        return value.replace("\\", "\\\\").replace("'", "\\'")
+
+    def fetch_file_content(self, file_id: str) -> str:
+        """Download a file by ID and return its content decoded as UTF-8.
+
+        Raises:
+            UnicodeDecodeError: If the downloaded bytes are not valid UTF-8.
+        """
+        request = self.service.files().get_media(fileId=file_id)
+        file_io = BytesIO()
+        downloader = MediaIoBaseDownload(file_io, request)
+
+        done = False
+        while not done:
+            # The progress status is not needed, only the completion flag.
+            _, done = downloader.next_chunk()
+
+        return file_io.getvalue().decode("utf-8")
+
+    def fetch_drive_files(self, folder_id: str = 'root', mime_type_filter: Optional[List[str]] = None) -> List[dict]:
+        """List the direct children of a folder, optionally filtered by MIME type.
+
+        Args:
+            folder_id: ID of the parent folder; defaults to 'root'.
+            mime_type_filter: If given, only files whose MIME type is one of
+                these values are returned.
+
+        Returns:
+            One dict per file holding the requested 'id', 'name' and 'mimeType' fields.
+        """
+        query = f"'{self._escape_query_value(folder_id)}' in parents"
+        if mime_type_filter:
+            mime_types_query = " or ".join(
+                f"mimeType='{self._escape_query_value(mime_type)}'" for mime_type in mime_type_filter
+            )
+            query += f" and ({mime_types_query})"
+
+        # A single response holds at most one page of results, so keep
+        # following nextPageToken until the listing is exhausted.
+        files: List[dict] = []
+        page_token = None
+        while True:
+            results = self.service.files().list(
+                q=query,
+                pageSize=1000,
+                fields="nextPageToken, files(id, name, mimeType)",
+                pageToken=page_token,
+            ).execute()
+            files.extend(results.get('files', []))
+            page_token = results.get('nextPageToken')
+            if not page_token:
+                return files
+
+    def load_data(self, folder_id: str = 'root', mime_type_filter: Optional[List[str]] = None) -> List[Document]:
+        """Download every non-folder file in a folder and wrap it in a Document.
+
+        Files that fail to download or decode are reported with print() and
+        skipped, so one bad file does not abort the whole load.
+
+        Args:
+            folder_id: ID of the folder to read; defaults to 'root'.
+            mime_type_filter: Optional list of MIME types to restrict to.
+
+        Returns:
+            Documents whose metadata carries the file name ("title") and a
+            Drive view URL ("source").
+        """
+        files = self.fetch_drive_files(folder_id, mime_type_filter)
+        documents = []
+
+        for file in files:
+            # Folder entries are not loaded as documents.
+            if file['mimeType'] == 'application/vnd.google-apps.folder':
+                continue
+            try:
+                content = self.fetch_file_content(file['id'])
+                documents.append(Document(page_content=content, metadata={
+                    "title": file['name'],
+                    "source": f"https://drive.google.com/file/d/{file['id']}/view"
+                }))
+            except Exception as e:
+                print(f"Failed to load file {file['name']}: {e}")
+        return documents
--- a/application/parser/remote/github_loader.py
+++ b/application/parser/remote/github_loader.py
@@ -3,6 +3,7 @@ import requests
 from typing import List
 from application.parser.remote.base import BaseRemote
 from langchain_core.documents import Document
+import mimetypes

 class GitHubLoader(BaseRemote):
    def __init__(self):
@@ -18,13 +19,19 @@ class GitHubLoader(BaseRemote):

        if response.status_code == 200:
            content = response.json()
+            mime_type, _ = mimetypes.guess_type(file_path)  # Guess the MIME type based on the file extension
+
            if content.get("encoding") == "base64":
-                try:
-                    decoded_content = base64.b64decode(content["content"]).decode("utf-8")
-                    return f"Filename: {file_path}\n\n{decoded_content}"
-                except Exception as e:
-                    print(f"Error decoding content for {file_path}: {e}")
-                    raise
+                if mime_type and mime_type.startswith("text"):  # Handle only text files
+                    try:
+                        decoded_content = base64.b64decode(content["content"]).decode("utf-8")
+                        return f"Filename: {file_path}\n\n{decoded_content}"
+                    except Exception as e:
+                        # print(f"Error decoding content for {file_path}: {e}")
+                        raise
+                else:
+                    # print(f"Skipping binary file: {file_path} (MIME type: {mime_type})")
+                    return f"Filename: {file_path} is a binary file and was skipped."
            else:
                return f"Filename: {file_path}\n\n{content['content']}"
        else: