fix: GitHubLoader to Handle Binary Files

This commit is contained in:
devendra.parihar
2024-10-18 12:07:44 +05:30
parent 7bf79675c1
commit a4c0861cf4
2 changed files with 78 additions and 6 deletions

View File

@@ -0,0 +1,65 @@
import os
from typing import List
from google.oauth2.credentials import Credentials
from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload
from google.auth.transport.requests import Request
from io import BytesIO
from langchain_core.documents import Document
from application.parser.remote.base import BaseRemote
# OAuth2 scope requested when loading the stored user credentials:
# read-only access to Google Drive.
SCOPES = ['https://www.googleapis.com/auth/drive.readonly']
class GoogleDriveLoader(BaseRemote):
    """Load the files of a Google Drive folder as ``Document`` objects.

    Authenticates with a stored OAuth2 user token, lists the files that are
    direct children of a folder, downloads each one and decodes it as UTF-8
    text.
    """

    def __init__(self, token_path: str, credentials_path: str):
        """Create the Drive v3 API client from a stored OAuth2 token file.

        Args:
            token_path: Path to the authorized-user token JSON file.
            credentials_path: Currently not read by this constructor.

        Raises:
            Exception: If the token file is missing, or the stored
                credentials are invalid and cannot be refreshed.
        """
        # Load OAuth2 credentials from the token file, if it exists.
        self.creds = None
        if os.path.exists(token_path):
            self.creds = Credentials.from_authorized_user_file(token_path, SCOPES)

        if not self.creds or not self.creds.valid:
            # Only an expired credential that carries a refresh token can be
            # renewed here; anything else needs a fresh authentication.
            if self.creds and self.creds.expired and self.creds.refresh_token:
                self.creds.refresh(Request())
            else:
                raise Exception("Invalid or missing credentials. Please authenticate.")

        # Initialize the Google Drive API client.
        self.service = build('drive', 'v3', credentials=self.creds)

    @staticmethod
    def _escape_query_value(value: str) -> str:
        """Escape backslashes and single quotes for a quoted Drive query value."""
        # Backslashes first, so the ones added for quotes are not doubled.
        return value.replace("\\", "\\\\").replace("'", "\\'")

    def fetch_file_content(self, file_id: str) -> str:
        """Download a file and return its content decoded as UTF-8.

        Args:
            file_id: Drive ID of the file to download.

        Returns:
            The file content as text.

        Raises:
            UnicodeDecodeError: If the downloaded bytes are not valid UTF-8
                (for example, binary files).
        """
        request = self.service.files().get_media(fileId=file_id)
        file_io = BytesIO()
        downloader = MediaIoBaseDownload(file_io, request)

        # Pull chunks until the downloader reports completion; the progress
        # status it also returns is not needed.
        done = False
        while not done:
            _, done = downloader.next_chunk()

        return file_io.getvalue().decode("utf-8")

    def fetch_drive_files(self, folder_id: str = 'root', mime_type_filter: List[str] = None) -> List[dict]:
        """List the files that are direct children of a Drive folder.

        Args:
            folder_id: ID of the parent folder; defaults to ``'root'``.
            mime_type_filter: If given and non-empty, only files whose MIME
                type matches one of these values are returned.

        Returns:
            A list of dicts with the ``id``, ``name`` and ``mimeType`` fields
            of every matching file, across all result pages.
        """
        # Values are escaped so a quote or backslash cannot break the query.
        query = f"'{self._escape_query_value(folder_id)}' in parents"
        if mime_type_filter:
            mime_types_query = " or ".join(
                f"mimeType='{self._escape_query_value(mime_type)}'"
                for mime_type in mime_type_filter
            )
            query += f" and ({mime_types_query})"

        # A single list call returns at most one page; follow nextPageToken
        # until the API stops returning one so no files are dropped.
        files: List[dict] = []
        page_token = None
        while True:
            results = self.service.files().list(
                q=query,
                pageSize=1000,
                fields="nextPageToken, files(id, name, mimeType)",
                pageToken=page_token,
            ).execute()
            files.extend(results.get('files', []))
            page_token = results.get('nextPageToken')
            if not page_token:
                break
        return files

    def load_data(self, folder_id: str = 'root', mime_type_filter: List[str] = None) -> List[Document]:
        """Download every non-folder file in a Drive folder as a ``Document``.

        Args:
            folder_id: ID of the folder to read; defaults to ``'root'``.
            mime_type_filter: Optional list of MIME types to restrict the
                listing to.

        Returns:
            One ``Document`` per file that was downloaded and decoded
            successfully, with ``title`` and ``source`` metadata. Files that
            fail to load are reported on stdout and skipped.
        """
        # Fetch the list of files within the specified folder.
        files = self.fetch_drive_files(folder_id, mime_type_filter)
        documents = []

        for file in files:
            # Sub-folders have no downloadable content.
            if file['mimeType'] == 'application/vnd.google-apps.folder':
                continue
            try:
                content = self.fetch_file_content(file['id'])
                documents.append(Document(page_content=content, metadata={
                    "title": file['name'],
                    "source": f"https://drive.google.com/file/d/{file['id']}/view"
                }))
            except Exception as e:
                # Best effort: one unreadable file (download error, non-UTF-8
                # content, ...) must not abort the whole folder load.
                print(f"Failed to load file {file['name']}: {e}")
        return documents