mirror of https://github.com/arc53/DocsGPT.git, synced 2025-11-29 00:23:17 +00:00
(feat:ingestion) google drive loader
501
application/parser/remote/google_drive_loader.py
Normal file
@@ -0,0 +1,501 @@
"""
Google Drive loader for DocsGPT.

Loads documents from Google Drive using the Google Drive API.
"""

import io
import logging
import os
from typing import List, Dict, Any, Optional

from googleapiclient.http import MediaIoBaseDownload
from googleapiclient.errors import HttpError

from application.parser.remote.base import BaseRemote
from application.parser.remote.google_auth import GoogleDriveAuth
from application.parser.schema.base import Document


class GoogleDriveLoader(BaseRemote):

    SUPPORTED_MIME_TYPES = {
        'application/pdf': '.pdf',
        'application/vnd.google-apps.document': '.docx',
        'application/vnd.google-apps.presentation': '.pptx',
        'application/vnd.google-apps.spreadsheet': '.xlsx',
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx',
        'application/vnd.openxmlformats-officedocument.presentationml.presentation': '.pptx',
        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': '.xlsx',
        'application/msword': '.doc',
        'application/vnd.ms-powerpoint': '.ppt',
        'application/vnd.ms-excel': '.xls',
        'text/plain': '.txt',
        'text/csv': '.csv',
        'text/html': '.html',
        'application/rtf': '.rtf',
        'image/jpeg': '.jpg',
        'image/jpg': '.jpg',
        'image/png': '.png',
    }

    EXPORT_FORMATS = {
        'application/vnd.google-apps.document': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
        'application/vnd.google-apps.presentation': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
        'application/vnd.google-apps.spreadsheet': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
    }

    def __init__(self, session_token: str):
        self.auth = GoogleDriveAuth()
        self.session_token = session_token

        token_info = self.auth.get_token_info_from_session(session_token)
        self.credentials = self.auth.create_credentials_from_token_info(token_info)

        try:
            self.service = self.auth.build_drive_service(self.credentials)
        except Exception as e:
            logging.warning(f"Could not build Google Drive service: {e}")
            self.service = None

    def _process_file(self, file_metadata: Dict[str, Any], load_content: bool = True) -> Optional[Document]:
        try:
            file_id = file_metadata.get('id')
            file_name = file_metadata.get('name', 'Unknown')
            mime_type = file_metadata.get('mimeType', 'application/octet-stream')

            if mime_type not in self.SUPPORTED_MIME_TYPES and not mime_type.startswith('application/vnd.google-apps.'):
                logging.info(f"Skipping unsupported file type: {mime_type} for file {file_name}")
                return None

            # Google Drive provides timezone-aware ISO8601 dates
            doc_metadata = {
                'file_name': file_name,
                'mime_type': mime_type,
                'size': file_metadata.get('size', 'Unknown'),
                'created_time': file_metadata.get('createdTime'),
                'modified_time': file_metadata.get('modifiedTime'),
                'parents': file_metadata.get('parents', []),
                'source': 'google_drive'
            }

            if not load_content:
                return Document(
                    text="",
                    doc_id=file_id,
                    extra_info=doc_metadata
                )

            content = self._download_file_content(file_id, mime_type)
            if content is None:
                logging.warning(f"Could not load content for file {file_name} ({file_id})")
                return None

            return Document(
                text=content,
                doc_id=file_id,
                extra_info=doc_metadata
            )

        except Exception as e:
            logging.error(f"Error processing file: {e}")
            return None

    def load_data(self, inputs: Dict[str, Any]) -> List[Document]:
        """
        Load items from Google Drive according to simple browsing semantics.

        Behavior:
        - If file_ids are provided: return those files (optionally with content).
        - If folder_id is provided: return the immediate children (folders and files) of that folder.
        - If no folder_id: return the immediate children (folders and files) of Drive 'root'.

        Args:
            inputs: Dictionary containing configuration:
                - folder_id: Optional Google Drive folder ID whose direct children to list
                - file_ids: Optional list of specific file IDs to load
                - limit: Maximum number of items to return
                - list_only: If True, only return metadata without content
                - session_token: Optional session token to use for authentication (backward compatibility)

        Returns:
            List of Document objects (folders are returned as metadata-only documents)
        """
        session_token = inputs.get('session_token')
        if session_token and session_token != self.session_token:
            logging.warning("Session token in inputs differs from loader's session token. Using loader's session token.")
        self.config = inputs

        try:
            documents: List[Document] = []

            folder_id = inputs.get('folder_id')
            file_ids = inputs.get('file_ids', [])
            limit = inputs.get('limit', 100)
            list_only = inputs.get('list_only', False)
            load_content = not list_only

            if file_ids:
                # Specific files requested: load them
                for file_id in file_ids:
                    try:
                        doc = self._load_file_by_id(file_id, load_content=load_content)
                        if doc:
                            documents.append(doc)
                        elif hasattr(self, '_credential_refreshed') and self._credential_refreshed:
                            self._credential_refreshed = False
                            logging.info(f"Retrying load of file {file_id} after credential refresh")
                            doc = self._load_file_by_id(file_id, load_content=load_content)
                            if doc:
                                documents.append(doc)
                    except Exception as e:
                        logging.error(f"Error loading file {file_id}: {e}")
                        continue
            else:
                # Browsing mode: list immediate children of provided folder or root
                parent_id = folder_id if folder_id else 'root'
                documents = self._list_items_in_parent(parent_id, limit=limit, load_content=load_content)

            logging.info(f"Loaded {len(documents)} documents from Google Drive")
            return documents

        except Exception as e:
            logging.error(f"Error loading data from Google Drive: {e}", exc_info=True)
            raise

    def _load_file_by_id(self, file_id: str, load_content: bool = True) -> Optional[Document]:
        self._ensure_service()

        try:
            file_metadata = self.service.files().get(
                fileId=file_id,
                fields='id,name,mimeType,size,createdTime,modifiedTime,parents'
            ).execute()

            return self._process_file(file_metadata, load_content=load_content)

        except HttpError as e:
            logging.error(f"HTTP error loading file {file_id}: {e.resp.status} - {e.content}")

            if e.resp.status in [401, 403]:
                if hasattr(self.credentials, 'refresh_token') and self.credentials.refresh_token:
                    try:
                        from google.auth.transport.requests import Request
                        self.credentials.refresh(Request())
                        self._ensure_service()
                        return None
                    except Exception as refresh_error:
                        raise ValueError(f"Authentication failed and could not be refreshed: {refresh_error}")
                else:
                    raise ValueError("Authentication failed and cannot be refreshed: missing refresh_token")

            return None
        except Exception as e:
            logging.error(f"Error loading file {file_id}: {e}")
            return None

    def _list_items_in_parent(self, parent_id: str, limit: int = 100, load_content: bool = False) -> List[Document]:
        self._ensure_service()

        documents: List[Document] = []

        try:
            query = f"'{parent_id}' in parents and trashed=false"
            page_token = None

            while True:
                page_size = 100
                if limit:
                    remaining = max(0, limit - len(documents))
                    if remaining == 0:
                        break
                    page_size = min(100, remaining)

                results = self.service.files().list(
                    q=query,
                    fields='nextPageToken,files(id,name,mimeType,size,createdTime,modifiedTime,parents)',
                    pageToken=page_token,
                    pageSize=page_size
                ).execute()

                items = results.get('files', [])
                for item in items:
                    mime_type = item.get('mimeType')
                    if mime_type == 'application/vnd.google-apps.folder':
                        doc_metadata = {
                            'file_name': item.get('name', 'Unknown'),
                            'mime_type': mime_type,
                            'size': item.get('size', 'Unknown'),
                            'created_time': item.get('createdTime'),
                            'modified_time': item.get('modifiedTime'),
                            'parents': item.get('parents', []),
                            'source': 'google_drive',
                            'is_folder': True
                        }
                        documents.append(Document(text="", doc_id=item.get('id'), extra_info=doc_metadata))
                    else:
                        doc = self._process_file(item, load_content=load_content)
                        if doc:
                            documents.append(doc)

                    if limit and len(documents) >= limit:
                        return documents

                page_token = results.get('nextPageToken')
                if not page_token:
                    break

            return documents
        except Exception as e:
            logging.error(f"Error listing items under parent {parent_id}: {e}")
            return documents

    def _download_file_content(self, file_id: str, mime_type: str) -> Optional[str]:
        if not self.credentials.token:
            logging.warning("No access token in credentials, attempting to refresh")
            if hasattr(self.credentials, 'refresh_token') and self.credentials.refresh_token:
                try:
                    from google.auth.transport.requests import Request
                    self.credentials.refresh(Request())
                    logging.info("Credentials refreshed successfully")
                    self._ensure_service()
                except Exception as e:
                    logging.error(f"Failed to refresh credentials: {e}")
                    raise ValueError("Authentication failed and cannot be refreshed: missing or invalid refresh_token")
            else:
                logging.error("No access token and no refresh_token available")
                raise ValueError("Authentication failed and cannot be refreshed: missing refresh_token")

        if self.credentials.expired:
            logging.warning("Credentials are expired, attempting to refresh")
            if hasattr(self.credentials, 'refresh_token') and self.credentials.refresh_token:
                try:
                    from google.auth.transport.requests import Request
                    self.credentials.refresh(Request())
                    logging.info("Credentials refreshed successfully")
                    self._ensure_service()
                except Exception as e:
                    logging.error(f"Failed to refresh expired credentials: {e}")
                    raise ValueError("Authentication failed and cannot be refreshed: expired credentials")
            else:
                logging.error("Credentials expired and no refresh_token available")
                raise ValueError("Authentication failed and cannot be refreshed: missing refresh_token")

        try:
            if mime_type in self.EXPORT_FORMATS:
                export_mime_type = self.EXPORT_FORMATS[mime_type]
                request = self.service.files().export_media(
                    fileId=file_id,
                    mimeType=export_mime_type
                )
            else:
                request = self.service.files().get_media(fileId=file_id)

            file_io = io.BytesIO()
            downloader = MediaIoBaseDownload(file_io, request)

            done = False
            while not done:
                try:
                    _, done = downloader.next_chunk()
                except HttpError as e:
                    logging.error(f"HTTP error downloading file {file_id}: {e.resp.status} - {e.content}")
                    return None
                except Exception as e:
                    logging.error(f"Error during download of file {file_id}: {e}")
                    return None

            content_bytes = file_io.getvalue()

            try:
                content = content_bytes.decode('utf-8')
            except UnicodeDecodeError:
                try:
                    content = content_bytes.decode('latin-1')
                except UnicodeDecodeError:
                    logging.error(f"Could not decode file {file_id} as text")
                    return None

            return content

        except HttpError as e:
            logging.error(f"HTTP error downloading file {file_id}: {e.resp.status} - {e.content}")

            if e.resp.status in [401, 403]:
                logging.error(f"Authentication error downloading file {file_id}")

                if hasattr(self.credentials, 'refresh_token') and self.credentials.refresh_token:
                    logging.info(f"Attempting to refresh credentials for file {file_id}")
                    try:
                        from google.auth.transport.requests import Request
                        self.credentials.refresh(Request())
                        logging.info("Credentials refreshed successfully")
                        self._credential_refreshed = True
                        self._ensure_service()
                        return None
                    except Exception as refresh_error:
                        logging.error(f"Error refreshing credentials: {refresh_error}")
                        raise ValueError(f"Authentication failed and could not be refreshed: {refresh_error}")
                else:
                    logging.error("Cannot refresh credentials: missing refresh_token")
                    raise ValueError("Authentication failed and cannot be refreshed: missing refresh_token")

            return None
        except Exception as e:
            logging.error(f"Error downloading file {file_id}: {e}")
            return None

    def _download_file_to_directory(self, file_id: str, local_dir: str) -> bool:
        try:
            self._ensure_service()
            return self._download_single_file(file_id, local_dir)
        except Exception as e:
            logging.error(f"Error downloading file {file_id}: {e}", exc_info=True)
            return False

    def _ensure_service(self):
        if not self.service:
            try:
                self.service = self.auth.build_drive_service(self.credentials)
            except Exception as e:
                raise ValueError(f"Cannot access Google Drive: {e}")

    def _download_single_file(self, file_id: str, local_dir: str) -> bool:
        file_metadata = self.service.files().get(
            fileId=file_id,
            fields='name,mimeType'
        ).execute()

        file_name = file_metadata['name']
        mime_type = file_metadata['mimeType']

        if mime_type not in self.SUPPORTED_MIME_TYPES and not mime_type.startswith('application/vnd.google-apps.'):
            return False

        os.makedirs(local_dir, exist_ok=True)
        full_path = os.path.join(local_dir, file_name)

        if mime_type in self.EXPORT_FORMATS:
            export_mime_type = self.EXPORT_FORMATS[mime_type]
            request = self.service.files().export_media(
                fileId=file_id,
                mimeType=export_mime_type
            )
            extension = self._get_extension_for_mime_type(export_mime_type)
            if not full_path.endswith(extension):
                full_path += extension
        else:
            request = self.service.files().get_media(fileId=file_id)

        with open(full_path, 'wb') as f:
            downloader = MediaIoBaseDownload(f, request)
            done = False
            while not done:
                _, done = downloader.next_chunk()

        return True

    def _download_folder_recursive(self, folder_id: str, local_dir: str, recursive: bool = True) -> int:
        files_downloaded = 0
        query = f"'{folder_id}' in parents and trashed=false"

        page_token = None
        while True:
            results = self.service.files().list(
                q=query,
                fields='nextPageToken,files(id,name,mimeType)',
                pageToken=page_token
            ).execute()

            files = results.get('files', [])

            for file_metadata in files:
                if file_metadata['mimeType'] == 'application/vnd.google-apps.folder':
                    if recursive:
                        subfolder_path = os.path.join(local_dir, file_metadata['name'])
                        os.makedirs(subfolder_path, exist_ok=True)
                        files_downloaded += self._download_folder_recursive(
                            file_metadata['id'],
                            subfolder_path,
                            recursive
                        )
                else:
                    if self._download_single_file(file_metadata['id'], local_dir):
                        files_downloaded += 1

            page_token = results.get('nextPageToken')
            if not page_token:
                break

        return files_downloaded

    def _get_extension_for_mime_type(self, mime_type: str) -> str:
        extensions = {
            'application/pdf': '.pdf',
            'text/plain': '.txt',
            'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx',
            'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': '.xlsx',
            'application/vnd.openxmlformats-officedocument.presentationml.presentation': '.pptx',
            'text/html': '.html',
            'text/markdown': '.md',
        }
        return extensions.get(mime_type, '.bin')

    def _download_folder_contents(self, folder_id: str, local_dir: str, recursive: bool = True) -> int:
        try:
            self._ensure_service()
            return self._download_folder_recursive(folder_id, local_dir, recursive)
        except Exception as e:
            logging.error(f"Error downloading folder {folder_id}: {e}", exc_info=True)
            return 0

    def download_to_directory(self, local_dir: str, source_config: dict = None) -> dict:
        if source_config is None:
            source_config = {}

        config = source_config if source_config else getattr(self, 'config', {})

        files_downloaded = 0

        try:
            folder_id = config.get('folder_id')
            file_ids = config.get('file_ids', [])
            recursive = config.get('recursive', True)

            if file_ids:
                if isinstance(file_ids, str):
                    file_ids = [file_ids]

                for file_id in file_ids:
                    if self._download_file_to_directory(file_id, local_dir):
                        files_downloaded += 1

            elif folder_id:
                files_downloaded = self._download_folder_contents(folder_id, local_dir, recursive)

            else:
                raise ValueError("No folder_id or file_ids provided for download")

            return {
                "files_downloaded": files_downloaded,
                "directory_path": local_dir,
                "empty_result": files_downloaded == 0,
                "source_type": "google_drive",
                "config_used": config
            }

        except Exception as e:
            return {
                "files_downloaded": 0,
                "directory_path": local_dir,
                "empty_result": True,
                "error": str(e),
                "source_type": "google_drive"
            }
@@ -3,6 +3,7 @@ from application.parser.remote.crawler_loader import CrawlerLoader
from application.parser.remote.web_loader import WebLoader
from application.parser.remote.reddit_loader import RedditPostsLoaderRemote
from application.parser.remote.github_loader import GitHubLoader
+from application.parser.remote.google_drive_loader import GoogleDriveLoader


class RemoteCreator:
@@ -12,11 +13,12 @@ class RemoteCreator:
        "crawler": CrawlerLoader,
        "reddit": RedditPostsLoaderRemote,
        "github": GitHubLoader,
+       "google_drive": GoogleDriveLoader,
    }

    @classmethod
    def create_loader(cls, type, *args, **kwargs):
        loader_class = cls.loaders.get(type.lower())
        if not loader_class:
-           raise ValueError(f"No LLM class found for type {type}")
+           raise ValueError(f"No loader class found for type {type}")
        return loader_class(*args, **kwargs)
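A minimal usage sketch of the new loader (not part of this commit). It assumes the RemoteCreator shown above lives at application.parser.remote.remote_creator and that a valid Google OAuth session token is available; the token, folder ID, file ID, and output directory below are placeholders.

    from application.parser.remote.remote_creator import RemoteCreator

    # "google_drive" is the key registered in RemoteCreator.loaders above;
    # the session token is forwarded to GoogleDriveLoader.__init__.
    loader = RemoteCreator.create_loader("google_drive", "SESSION_TOKEN_PLACEHOLDER")

    # Browse the immediate children of a folder without downloading content.
    docs = loader.load_data({
        "folder_id": "FOLDER_ID_PLACEHOLDER",
        "limit": 25,
        "list_only": True,
    })

    # Or mirror specific files into a local directory.
    result = loader.download_to_directory(
        "/tmp/drive_export",
        {"file_ids": ["FILE_ID_PLACEHOLDER"]},
    )
    print(result["files_downloaded"], result["directory_path"])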