mirror of
https://github.com/arc53/DocsGPT.git
synced 2025-11-29 08:33:20 +00:00
fix:GitHubLoader to Handle Binary Files
This commit is contained in:
65
application/parser/remote/drive_loader.py
Normal file
65
application/parser/remote/drive_loader.py
Normal file
@@ -0,0 +1,65 @@
|
||||
import os
|
||||
from typing import List
|
||||
from google.oauth2.credentials import Credentials
|
||||
from googleapiclient.discovery import build
|
||||
from googleapiclient.http import MediaIoBaseDownload
|
||||
from google.auth.transport.requests import Request
|
||||
from io import BytesIO
|
||||
from langchain_core.documents import Document
|
||||
from application.parser.remote.base import BaseRemote
|
||||
|
||||
# OAuth scopes passed to Credentials.from_authorized_user_file when loading
# the stored token; only the read-only Drive scope is requested.
SCOPES = ['https://www.googleapis.com/auth/drive.readonly']
|
||||
|
||||
class GoogleDriveLoader(BaseRemote):
    """Remote loader that turns the files of a Google Drive folder into Documents."""

    def __init__(self, token_path: str, credentials_path: str):
        """Load stored OAuth2 credentials and build the Drive v3 client.

        Args:
            token_path: Path to the authorized-user token JSON file.
            credentials_path: Accepted for interface compatibility; it is not
                read by this constructor.

        Raises:
            Exception: If no token file exists, or the stored credentials are
                invalid and cannot be refreshed.
        """
        # Load OAuth2 credentials from the token JSON file, if present.
        self.creds = None
        if os.path.exists(token_path):
            self.creds = Credentials.from_authorized_user_file(token_path, SCOPES)

        if not self.creds or not self.creds.valid:
            # Only an expired credential that carries a refresh token can be
            # recovered without user interaction.
            if self.creds and self.creds.expired and self.creds.refresh_token:
                self.creds.refresh(Request())
            else:
                raise Exception("Invalid or missing credentials. Please authenticate.")

        # Initialize the Google Drive API client.
        self.service = build('drive', 'v3', credentials=self.creds)

    @staticmethod
    def _escape_query_value(value: str) -> str:
        """Escape backslashes and single quotes for use inside a quoted Drive query term."""
        return str(value).replace("\\", "\\\\").replace("'", "\\'")

    def fetch_file_content(self, file_id: str) -> str:
        """Download a file's media content and return it decoded as UTF-8.

        Args:
            file_id: Drive identifier of the file to download.

        Returns:
            The full file content as text.

        Raises:
            UnicodeDecodeError: If the downloaded bytes are not valid UTF-8.
        """
        request = self.service.files().get_media(fileId=file_id)
        file_io = BytesIO()
        downloader = MediaIoBaseDownload(file_io, request)

        # The download is chunked; keep pulling until the downloader reports completion.
        done = False
        while not done:
            _, done = downloader.next_chunk()

        return file_io.getvalue().decode("utf-8")

    def fetch_drive_files(self, folder_id: str = 'root', mime_type_filter: List[str] = None) -> List[dict]:
        """List the direct children of a Drive folder.

        Args:
            folder_id: Identifier of the parent folder (defaults to 'root').
            mime_type_filter: Optional MIME types; when given, only files
                matching one of them are returned.

        Returns:
            A list of dicts with the 'id', 'name' and 'mimeType' of each file,
            collected across every result page.
        """
        query = f"'{self._escape_query_value(folder_id)}' in parents"
        if mime_type_filter:
            mime_types_query = " or ".join(
                f"mimeType='{self._escape_query_value(mime_type)}'"
                for mime_type in mime_type_filter
            )
            query += f" and ({mime_types_query})"

        files = []
        page_token = None
        # A single response holds at most one page; follow nextPageToken so
        # large folders are not silently truncated.
        while True:
            results = self.service.files().list(
                q=query,
                pageSize=1000,
                fields="nextPageToken, files(id, name, mimeType)",
                pageToken=page_token,
            ).execute()
            files.extend(results.get('files', []))
            page_token = results.get('nextPageToken')
            if not page_token:
                return files

    def load_data(self, folder_id: str = 'root', mime_type_filter: List[str] = None) -> List[Document]:
        """Download every non-folder file in a Drive folder as a Document.

        Args:
            folder_id: Identifier of the folder to read (defaults to 'root').
            mime_type_filter: Optional MIME types used to restrict the listing.

        Returns:
            One Document per file that could be downloaded and decoded, with
            'title' and 'source' metadata. Files that fail are reported on
            stdout and skipped.
        """
        # Fetch the list of files within the specified folder.
        files = self.fetch_drive_files(folder_id, mime_type_filter)
        documents = []

        # Download each file's content and convert it into a document.
        for file in files:
            if file['mimeType'] == 'application/vnd.google-apps.folder':
                continue
            try:
                content = self.fetch_file_content(file['id'])
                documents.append(Document(page_content=content, metadata={
                    "title": file['name'],
                    "source": f"https://drive.google.com/file/d/{file['id']}/view"
                }))
            except Exception as e:
                # Best-effort: one unreadable file must not abort the whole load.
                print(f"Failed to load file {file['name']}: {e}")
        return documents
|
||||
@@ -3,6 +3,7 @@ import requests
|
||||
from typing import List
|
||||
from application.parser.remote.base import BaseRemote
|
||||
from langchain_core.documents import Document
|
||||
import mimetypes
|
||||
|
||||
class GitHubLoader(BaseRemote):
|
||||
def __init__(self):
|
||||
@@ -18,13 +19,19 @@ class GitHubLoader(BaseRemote):
|
||||
|
||||
if response.status_code == 200:
|
||||
content = response.json()
|
||||
mime_type, _ = mimetypes.guess_type(file_path) # Guess the MIME type based on the file extension
|
||||
|
||||
if content.get("encoding") == "base64":
|
||||
try:
|
||||
decoded_content = base64.b64decode(content["content"]).decode("utf-8")
|
||||
return f"Filename: {file_path}\n\n{decoded_content}"
|
||||
except Exception as e:
|
||||
print(f"Error decoding content for {file_path}: {e}")
|
||||
raise
|
||||
if mime_type and mime_type.startswith("text"): # Handle only text files
|
||||
try:
|
||||
decoded_content = base64.b64decode(content["content"]).decode("utf-8")
|
||||
return f"Filename: {file_path}\n\n{decoded_content}"
|
||||
except Exception as e:
|
||||
# print(f"Error decoding content for {file_path}: {e}")
|
||||
raise
|
||||
else:
|
||||
# print(f"Skipping binary file: {file_path} (MIME type: {mime_type})")
|
||||
return f"Filename: {file_path} is a binary file and was skipped."
|
||||
else:
|
||||
return f"Filename: {file_path}\n\n{content['content']}"
|
||||
else:
|
||||
|
||||
Reference in New Issue
Block a user