From b2b04268e9c6b52d9e8879c0640c796ba793b8c5 Mon Sep 17 00:00:00 2001 From: ManishMadan2882 Date: Thu, 21 Aug 2025 02:46:32 +0530 Subject: [PATCH 01/25] (feat:drive) oauth flow --- application/parser/remote/google_auth.py | 336 +++++++++++++++++++++++ application/requirements.txt | 3 + 2 files changed, 339 insertions(+) create mode 100644 application/parser/remote/google_auth.py diff --git a/application/parser/remote/google_auth.py b/application/parser/remote/google_auth.py new file mode 100644 index 00000000..2b736e69 --- /dev/null +++ b/application/parser/remote/google_auth.py @@ -0,0 +1,336 @@ +import logging +import time +import datetime +from typing import Optional, Dict, Any + +from google.oauth2.credentials import Credentials +from google_auth_oauthlib.flow import Flow +from googleapiclient.discovery import build +from googleapiclient.errors import HttpError + +from application.core.settings import settings + + +class GoogleDriveAuth: + """ + Handles Google OAuth 2.0 authentication for Google Drive access. + """ + + SCOPES = [ + 'https://www.googleapis.com/auth/drive.readonly', + 'https://www.googleapis.com/auth/drive.metadata.readonly' + ] + + def __init__(self): + self.client_id = settings.GOOGLE_CLIENT_ID + self.client_secret = settings.GOOGLE_CLIENT_SECRET + self.redirect_uri = settings.GOOGLE_REDIRECT_URI or "http://localhost:7091/api/google-drive/callback" + + if not self.client_id or not self.client_secret: + raise ValueError("Google OAuth credentials not configured. Please set GOOGLE_CLIENT_ID and GOOGLE_CLIENT_SECRET in settings.") + + def get_authorization_url(self, state: Optional[str] = None) -> str: + """ + Generate Google OAuth authorization URL. + + Args: + state: Optional state parameter for CSRF protection + + Returns: + Authorization URL for Google OAuth flow + """ + try: + flow = Flow.from_client_config( + { + "web": { + "client_id": self.client_id, + "client_secret": self.client_secret, + "auth_uri": "https://accounts.google.com/o/oauth2/auth", + "token_uri": "https://oauth2.googleapis.com/token", + "redirect_uris": [self.redirect_uri] + } + }, + scopes=self.SCOPES + ) + flow.redirect_uri = self.redirect_uri + + authorization_url, _ = flow.authorization_url( + access_type='offline', + prompt='consent', + include_granted_scopes='true', + state=state + ) + + return authorization_url + + except Exception as e: + logging.error(f"Error generating authorization URL: {e}") + raise + + def exchange_code_for_tokens(self, authorization_code: str) -> Dict[str, Any]: + """ + Exchange authorization code for access and refresh tokens. 
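+
+        Illustrative call (a sketch; the code value is a placeholder):
+
+            auth = GoogleDriveAuth()
+            token_info = auth.exchange_code_for_tokens("<code-from-callback>")
+            # token_info holds access_token, refresh_token, token_uri,
+            # client_id, client_secret, scopes, and an ISO 8601 expiry.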
+ + Args: + authorization_code: Authorization code from OAuth callback + + Returns: + Dictionary containing token information + """ + try: + if not authorization_code: + raise ValueError("Authorization code is required") + + flow = Flow.from_client_config( + { + "web": { + "client_id": self.client_id, + "client_secret": self.client_secret, + "auth_uri": "https://accounts.google.com/o/oauth2/auth", + "token_uri": "https://oauth2.googleapis.com/token", + "redirect_uris": [self.redirect_uri] + } + }, + scopes=self.SCOPES + ) + flow.redirect_uri = self.redirect_uri + + flow.fetch_token(code=authorization_code) + + credentials = flow.credentials + + if credentials.expiry: + try: + expiry = credentials.expiry + if expiry.tzinfo is None: + # If expiry is offset-naive, make it offset-aware + expiry = expiry.replace(tzinfo=datetime.timezone.utc) + + current_time = datetime.datetime.now(datetime.timezone.utc) + time_until_expiry = expiry - current_time + logging.info(f"Token expires in: {time_until_expiry}") + except Exception as e: + logging.warning(f"Error calculating token expiry: {e}") + else: + logging.info("Token has no expiry information") + + if not credentials.refresh_token: + logging.warning("OAuth flow did not return a refresh_token.") + if not credentials.token: + raise ValueError("OAuth flow did not return an access token") + + if not credentials.token_uri: + credentials.token_uri = "https://oauth2.googleapis.com/token" + + if not credentials.client_id: + credentials.client_id = self.client_id + + if not credentials.client_secret: + credentials.client_secret = self.client_secret + + if credentials.expiry: + try: + expiry_dt = credentials.expiry + if expiry_dt.tzinfo is None: # Ensure UTC timezone + expiry_dt = expiry_dt.replace(tzinfo=datetime.timezone.utc) + + current_time = datetime.datetime.now(datetime.timezone.utc) + time_until_expiry = expiry_dt - current_time + logging.info(f"Access token expires in {time_until_expiry}") + except Exception as e: + logging.warning(f"Error calculating token expiry: {e}") + + if not credentials.refresh_token: + raise ValueError( + "No refresh token received. This typically happens when offline access wasn't granted. 
" + ) + + expiry_iso = None + if credentials.expiry: + expiry_iso = credentials.expiry.isoformat() + + return { + 'access_token': credentials.token, + 'refresh_token': credentials.refresh_token, + 'token_uri': credentials.token_uri, + 'client_id': credentials.client_id, + 'client_secret': credentials.client_secret, + 'scopes': credentials.scopes, + 'expiry': expiry_iso + } + + except Exception as e: + logging.error(f"Error exchanging code for tokens: {e}") + raise + + def refresh_access_token(self, refresh_token: str) -> Dict[str, Any]: + try: + if not refresh_token: + raise ValueError("Refresh token is required") + + credentials = Credentials( + token=None, + refresh_token=refresh_token, + token_uri="https://oauth2.googleapis.com/token", + client_id=self.client_id, + client_secret=self.client_secret + ) + + from google.auth.transport.requests import Request + credentials.refresh(Request()) + + expiry_iso = None + if credentials.expiry: + expiry_iso = credentials.expiry.isoformat() + + return { + 'access_token': credentials.token, + 'refresh_token': refresh_token, + 'token_uri': credentials.token_uri, + 'client_id': credentials.client_id, + 'client_secret': credentials.client_secret, + 'scopes': credentials.scopes, + 'expiry': expiry_iso + } + except Exception as e: + logging.error(f"Error refreshing access token: {e}", exc_info=True) + raise + + def create_credentials_from_token_info(self, token_info: Dict[str, Any]) -> Credentials: + from application.core.settings import settings + + access_token = token_info.get('access_token') + if not access_token: + raise ValueError("No access token found in token_info") + + credentials = Credentials( + token=access_token, + refresh_token=token_info.get('refresh_token'), + token_uri= 'https://oauth2.googleapis.com/token', + client_id=settings.GOOGLE_CLIENT_ID, + client_secret=settings.GOOGLE_CLIENT_SECRET, + scopes=token_info.get('scopes', ['https://www.googleapis.com/auth/drive.readonly']) + ) + + if not credentials.token: + raise ValueError("Credentials created without valid access token") + + return credentials + + def build_drive_service(self, credentials: Credentials): + try: + if not credentials: + raise ValueError("No credentials provided") + + if not credentials.token and not credentials.refresh_token: + raise ValueError("No access token or refresh token available. User must re-authorize with offline access.") + + needs_refresh = credentials.expired or not credentials.token + if needs_refresh: + if credentials.refresh_token: + try: + from google.auth.transport.requests import Request + credentials.refresh(Request()) + except Exception as refresh_error: + raise ValueError(f"Failed to refresh credentials: {refresh_error}") + else: + raise ValueError("No access token or refresh token available. 
User must re-authorize with offline access.") + + return build('drive', 'v3', credentials=credentials) + + except HttpError as e: + raise ValueError(f"Failed to build Google Drive service: HTTP {e.resp.status}") + except Exception as e: + raise ValueError(f"Failed to build Google Drive service: {str(e)}") + + def is_token_expired(self, token_info): + if 'expiry' in token_info and token_info['expiry']: + try: + import datetime as dt + from dateutil import parser + + expiry_input = token_info['expiry'] + + if isinstance(expiry_input, str): + # Parse ISO format string + expiry_dt = parser.parse(expiry_input) + elif isinstance(expiry_input, dt.datetime): + expiry_dt = expiry_input + else: + logging.warning(f"Unexpected expiry format: {type(expiry_input)}") + return True + + # Ensure UTC timezone + if expiry_dt.tzinfo is None: + expiry_dt = expiry_dt.replace(tzinfo=dt.timezone.utc) + + current_time = dt.datetime.now(dt.timezone.utc) + + return current_time >= expiry_dt - dt.timedelta(seconds=60) + + except Exception: + return True + + if 'access_token' in token_info and token_info['access_token']: + return False + + return True + + def get_token_info_from_session(self, session_token: str) -> Dict[str, Any]: + try: + from application.core.mongo_db import MongoDB + from application.core.settings import settings + + mongo = MongoDB.get_client() + db = mongo[settings.MONGO_DB_NAME] + sessions_collection = db["drive_sessions"] + + session = sessions_collection.find_one({"session_token": session_token}) + if not session: + raise ValueError(f"Invalid session token: {session_token}") + + if "token_info" not in session: + raise ValueError("Session missing token information") + + token_info = session["token_info"] + if not token_info: + raise ValueError("Invalid token information") + + required_fields = ["access_token", "refresh_token"] + missing_fields = [field for field in required_fields if field not in token_info or not token_info.get(field)] + if missing_fields: + raise ValueError(f"Missing required token fields: {missing_fields}") + + if 'client_id' not in token_info: + token_info['client_id'] = settings.GOOGLE_CLIENT_ID + if 'client_secret' not in token_info: + token_info['client_secret'] = settings.GOOGLE_CLIENT_SECRET + if 'token_uri' not in token_info: + token_info['token_uri'] = 'https://oauth2.googleapis.com/token' + + return token_info + + except Exception as e: + raise ValueError(f"Failed to retrieve Google Drive token information: {str(e)}") + + def validate_credentials(self, credentials: Credentials) -> bool: + """ + Validate Google Drive credentials by making a test API call. 
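+
+        Illustrative use alongside the other helpers in this class:
+
+            creds = auth.create_credentials_from_token_info(token_info)
+            if not auth.validate_credentials(creds):
+                token_info = auth.refresh_access_token(token_info["refresh_token"])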
+ + Args: + credentials: Google credentials object + + Returns: + True if credentials are valid, False otherwise + """ + try: + service = self.build_drive_service(credentials) + service.about().get(fields="user").execute() + return True + + except HttpError as e: + logging.error(f"HTTP error validating credentials: {e}") + return False + except Exception as e: + logging.error(f"Error validating credentials: {e}") + return False diff --git a/application/requirements.txt b/application/requirements.txt index 3778d941..b7076ed8 100644 --- a/application/requirements.txt +++ b/application/requirements.txt @@ -13,6 +13,9 @@ Flask==3.1.1 faiss-cpu==1.9.0.post1 flask-restx==1.3.0 google-genai==1.3.0 +google-api-python-client==2.179.0 +google-auth-httplib2==0.2.0 +google-auth-oauthlib==1.2.2 gTTS==2.5.4 gunicorn==23.0.0 javalang==0.13.0 From 3b69bea23d4ce8c2efe731daba9b6f32377c33d9 Mon Sep 17 00:00:00 2001 From: ManishMadan2882 Date: Thu, 21 Aug 2025 17:02:23 +0530 Subject: [PATCH 02/25] (chore:settings)addefault oath creds --- application/core/settings.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/application/core/settings.py b/application/core/settings.py index 9303b996..f1000056 100644 --- a/application/core/settings.py +++ b/application/core/settings.py @@ -40,6 +40,11 @@ class Settings(BaseSettings): FALLBACK_LLM_NAME: Optional[str] = None # model name for fallback llm FALLBACK_LLM_API_KEY: Optional[str] = None # api key for fallback llm + # Google Drive integration + GOOGLE_CLIENT_ID: Optional[str] = None # Replace with your actual Google OAuth client ID + GOOGLE_CLIENT_SECRET: Optional[str] = None# Replace with your actual Google OAuth client secret + GOOGLE_REDIRECT_URI: Optional[str] = None + # LLM Cache CACHE_REDIS_URL: str = "redis://localhost:6379/2" From c2bebbaefaf48e77a4f4f9a8574a709a58e3e78b Mon Sep 17 00:00:00 2001 From: ManishMadan2882 Date: Fri, 22 Aug 2025 03:29:23 +0530 Subject: [PATCH 03/25] (feat:oauth/drive) raw fe integrate --- frontend/public/google-drive-callback.html | 117 ++++++ frontend/src/upload/Upload.tsx | 442 ++++++++++++++++++++- frontend/src/upload/types/ingestor.ts | 28 +- 3 files changed, 563 insertions(+), 24 deletions(-) create mode 100644 frontend/public/google-drive-callback.html diff --git a/frontend/public/google-drive-callback.html b/frontend/public/google-drive-callback.html new file mode 100644 index 00000000..0272af9a --- /dev/null +++ b/frontend/public/google-drive-callback.html @@ -0,0 +1,117 @@ + + + + Google Drive Authentication + + + +
+
+    Google Drive Authentication
+
+    Processing authentication...
+
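+    <!--
+      Relay sketch (illustrative, not the original script): this page forwards
+      the OAuth result to the window that opened the popup, matching the
+      message shape Upload.tsx listens for:
+
+        window.opener.postMessage(
+          { type: 'google_drive_auth_success', session_token, user_email },
+          window.location.origin
+        );
+
+      and { type: 'google_drive_auth_error', error } on failure.
+    -->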
+ + + + diff --git a/frontend/src/upload/Upload.tsx b/frontend/src/upload/Upload.tsx index 420427d4..75749f29 100644 --- a/frontend/src/upload/Upload.tsx +++ b/frontend/src/upload/Upload.tsx @@ -25,6 +25,8 @@ import { IngestorFormSchemas, IngestorType, } from './types/ingestor'; +import FileIcon from '../assets/file.svg'; +import FolderIcon from '../assets/folder.svg'; function Upload({ receivedFile = [], @@ -48,6 +50,15 @@ function Upload({ const [activeTab, setActiveTab] = useState(renderTab); const [showAdvancedOptions, setShowAdvancedOptions] = useState(false); + // Google Drive state + const [isGoogleDriveConnected, setIsGoogleDriveConnected] = useState(false); + const [googleDriveFiles, setGoogleDriveFiles] = useState([]); + const [selectedFiles, setSelectedFiles] = useState([]); + const [isLoadingFiles, setIsLoadingFiles] = useState(false); + const [isAuthenticating, setIsAuthenticating] = useState(false); + const [userEmail, setUserEmail] = useState(''); + const [authError, setAuthError] = useState(''); + const renderFormFields = () => { const schema = IngestorFormSchemas[ingestor.type]; if (!schema) return null; @@ -204,6 +215,7 @@ function Upload({ { label: 'Link', value: 'url' }, { label: 'GitHub', value: 'github' }, { label: 'Reddit', value: 'reddit' }, + { label: 'Google Drive', value: 'google_drive' }, ]; const sourceDocs = useSelector(selectSourceDocs); @@ -428,29 +440,40 @@ function Upload({ formData.append('user', 'local'); formData.append('source', ingestor.type); - const defaultConfig = IngestorDefaultConfigs[ingestor.type].config; + let configData; - const mergedConfig = { ...defaultConfig, ...ingestor.config }; - const filteredConfig = Object.entries(mergedConfig).reduce( - (acc, [key, value]) => { - const field = IngestorFormSchemas[ingestor.type].find( - (f) => f.name === key, - ); - // Include the field if: - // 1. It's required, or - // 2. It's optional and has a non-empty value - if ( - field?.required || - (value !== undefined && value !== null && value !== '') - ) { - acc[key] = value; - } - return acc; - }, - {} as Record, - ); + if (ingestor.type === 'google_drive') { + const sessionToken = localStorage.getItem('google_drive_session_token'); + + configData = { + file_ids: selectedFiles, + recursive: ingestor.config.recursive, + session_token: sessionToken || null + }; + } else { + const defaultConfig = IngestorDefaultConfigs[ingestor.type].config; + const mergedConfig = { ...defaultConfig, ...ingestor.config }; + configData = Object.entries(mergedConfig).reduce( + (acc, [key, value]) => { + const field = IngestorFormSchemas[ingestor.type].find( + (f) => f.name === key, + ); + // Include the field if: + // 1. It's required, or + // 2. 
It's optional and has a non-empty value + if ( + field?.required || + (value !== undefined && value !== null && value !== '') + ) { + acc[key] = value; + } + return acc; + }, + {} as Record, + ); + } - formData.append('data', JSON.stringify(filteredConfig)); + formData.append('data', JSON.stringify(configData)); const apiHost: string = import.meta.env.VITE_API_HOST; const xhr = new XMLHttpRequest(); @@ -477,6 +500,233 @@ function Upload({ xhr.setRequestHeader('Authorization', `Bearer ${token}`); xhr.send(formData); }; + + useEffect(() => { + if (ingestor.type === 'google_drive') { + const sessionToken = localStorage.getItem('google_drive_session_token'); + + if (sessionToken) { + // Auto-authenticate if session token exists + setIsGoogleDriveConnected(true); + setAuthError(''); + + // Fetch user email and files using the existing session token + fetchUserEmailAndLoadFiles(sessionToken); + } + } + }, [ingestor.type]); + + const fetchUserEmailAndLoadFiles = async (sessionToken: string) => { + try { + const apiHost = import.meta.env.VITE_API_HOST; + + const validateResponse = await fetch(`${apiHost}/api/google-drive/validate-session`, { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + 'Authorization': `Bearer ${token}` + }, + body: JSON.stringify({ session_token: sessionToken }) + }); + + if (!validateResponse.ok) { + localStorage.removeItem('google_drive_session_token'); + setIsGoogleDriveConnected(false); + setAuthError('Session expired. Please reconnect to Google Drive.'); + return; + } + + const validateData = await validateResponse.json(); + + if (validateData.success) { + setUserEmail(validateData.user_email || 'Connected User'); + loadGoogleDriveFiles(sessionToken); + } else { + localStorage.removeItem('google_drive_session_token'); + setIsGoogleDriveConnected(false); + setAuthError(validateData.error || 'Session expired. Please reconnect your Google Drive account and make sure to grant offline access.'); + } + } catch (error) { + console.error('Error validating Google Drive session:', error); + setAuthError('Failed to validate session. Please reconnect.'); + setIsGoogleDriveConnected(false); + } + }; + + const handleGoogleDriveConnect = async () => { + console.log('Google Drive connect button clicked'); + setIsAuthenticating(true); + setAuthError(''); + + const existingToken = localStorage.getItem('google_drive_session_token'); + if (existingToken) { + fetchUserEmailAndLoadFiles(existingToken); + setIsAuthenticating(false); + return; + } + + try { + const apiHost = import.meta.env.VITE_API_HOST; + + const authResponse = await fetch(`${apiHost}/api/google-drive/auth`, { + headers: { + 'Authorization': `Bearer ${token}` + } + }); + + if (!authResponse.ok) { + throw new Error(`Failed to get authorization URL: ${authResponse.status}`); + } + + const authData = await authResponse.json(); + + if (!authData.success || !authData.authorization_url) { + throw new Error(authData.error || 'Failed to get authorization URL'); + } + + console.log('Opening Google OAuth window...'); + + const authWindow = window.open( + authData.authorization_url, + 'google-drive-auth', + 'width=500,height=600,scrollbars=yes,resizable=yes' + ); + + if (!authWindow) { + throw new Error('Failed to open authentication window. 
Please allow popups.'); + } + + const handleAuthMessage = (event: MessageEvent) => { + console.log('Received message event:', event.data); + + if (event.data.type === 'google_drive_auth_success') { + console.log('OAuth success received:', event.data); + setUserEmail(event.data.user_email || 'Connected User'); + setIsGoogleDriveConnected(true); + setIsAuthenticating(false); + setAuthError(''); + + if (event.data.session_token) { + localStorage.setItem('google_drive_session_token', event.data.session_token); + } + + window.removeEventListener('message', handleAuthMessage); + + loadGoogleDriveFiles(event.data.session_token); + } else if (event.data.type === 'google_drive_auth_error') { + console.error('OAuth error received:', event.data); + setAuthError(event.data.error || 'Authentication failed. Please make sure to grant all requested permissions, including offline access. You may need to revoke previous access and re-authorize.'); + setIsAuthenticating(false); + setIsGoogleDriveConnected(false); + window.removeEventListener('message', handleAuthMessage); + } + }; + + window.addEventListener('message', handleAuthMessage); + const checkClosed = setInterval(() => { + if (authWindow.closed) { + clearInterval(checkClosed); + window.removeEventListener('message', handleAuthMessage); + + if (!isGoogleDriveConnected && !isAuthenticating) { + setAuthError('Authentication was cancelled'); + } + } + }, 1000); + + } catch (error) { + console.error('Error during Google Drive authentication:', error); + setAuthError(error instanceof Error ? error.message : 'Authentication failed'); + setIsAuthenticating(false); + } + }; + + const loadGoogleDriveFiles = async (sessionToken: string) => { + setIsLoadingFiles(true); + + try { + const apiHost = import.meta.env.VITE_API_HOST; + const filesResponse = await fetch(`${apiHost}/api/google-drive/files`, { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + 'Authorization': `Bearer ${token}` + }, + body: JSON.stringify({ + session_token: sessionToken, + limit: 50 + }) + }); + + if (!filesResponse.ok) { + throw new Error(`Failed to load files: ${filesResponse.status}`); + } + + const filesData = await filesResponse.json(); + + if (filesData.success && filesData.files) { + setGoogleDriveFiles(filesData.files); + } else { + throw new Error(filesData.error || 'Failed to load files'); + } + + } catch (error) { + console.error('Error loading Google Drive files:', error); + setAuthError(error instanceof Error ? error.message : 'Failed to load files. 
Please make sure your Google Drive account is properly connected and you granted offline access during authorization.'); + + // Fallback to mock data for demo purposes + console.log('Using mock data as fallback...'); + const mockFiles = [ + { + id: '1', + name: 'Project Documentation.pdf', + type: 'application/pdf', + size: '2.5 MB', + modifiedTime: '2024-01-15', + iconUrl: '�' + }, + { + id: '2', + name: 'Meeting Notes.docx', + type: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', + size: '1.2 MB', + modifiedTime: '2024-01-14', + iconUrl: '�' + }, + { + id: '3', + name: 'Presentation.pptx', + type: 'application/vnd.openxmlformats-officedocument.presentationml.presentation', + size: '5.8 MB', + modifiedTime: '2024-01-13', + iconUrl: '�' + } + ]; + setGoogleDriveFiles(mockFiles); + } finally { + setIsLoadingFiles(false); + } + }; + + // Handle file selection + const handleFileSelect = (fileId: string) => { + setSelectedFiles(prev => { + if (prev.includes(fileId)) { + return prev.filter(id => id !== fileId); + } else { + return [...prev, fileId]; + } + }); + }; + + const handleSelectAll = () => { + if (selectedFiles.length === googleDriveFiles.length) { + setSelectedFiles([]); + } else { + setSelectedFiles(googleDriveFiles.map(file => file.id)); + } + }; + const { getRootProps, getInputProps, isDragActive } = useDropzone({ onDrop, multiple: true, @@ -515,6 +765,10 @@ function Upload({ if (!remoteName?.trim()) { return true; } + if (ingestor.type === 'google_drive') { + return !isGoogleDriveConnected || selectedFiles.length === 0; + } + const formFields: FormField[] = IngestorFormSchemas[ingestor.type]; for (const field of formFields) { if (field.required) { @@ -679,6 +933,147 @@ function Upload({ required={true} labelBgClassName="bg-white dark:bg-charleston-green-2" /> + {ingestor.type === 'google_drive' && ( +
+          <div className="mt-4 flex flex-col gap-4">
+            {authError && (
+              <div className="rounded-md border border-red-300 bg-red-50 p-3">
+                <span className="text-sm text-red-700">
+                  ⚠️ {authError}
+                </span>
+              </div>
+            )}
+
+            {!isGoogleDriveConnected ? (
+              <button
+                onClick={handleGoogleDriveConnect}
+                disabled={isAuthenticating}
+                className="bg-purple-30 hover:bg-violets-are-blue w-fit rounded-md px-4 py-2 text-white disabled:opacity-50"
+              >
+                {isAuthenticating ? 'Connecting...' : 'Connect to Google Drive'}
+              </button>
+            ) : (
+              <div>
+                {/* Connection Status */}
+                <div className="mb-3 flex items-center gap-2">
+                  <span className="h-2 w-2 rounded-full bg-green-500" />
+                  <span className="text-sm">Connected as {userEmail}</span>
+                </div>
+
+                {/* File Browser */}
+                <div className="rounded-md border border-gray-300">
+                  <div className="flex items-center justify-between border-b border-gray-300 p-3">
+                    <span className="text-sm font-medium">
+                      Select Files from Google Drive
+                    </span>
+                    {googleDriveFiles.length > 0 && (
+                      <button
+                        onClick={handleSelectAll}
+                        className="text-sm text-blue-600"
+                      >
+                        {selectedFiles.length === googleDriveFiles.length
+                          ? 'Deselect all'
+                          : 'Select all'}
+                      </button>
+                    )}
+                  </div>
+                  {selectedFiles.length > 0 && (
+                    <div className="px-3 py-1 text-xs text-gray-500">
+                      {selectedFiles.length} file{selectedFiles.length !== 1 ? 's' : ''} selected
+                    </div>
+                  )}
+
+                  <div className="max-h-60 overflow-y-auto">
+                    {isLoadingFiles ? (
+                      <div className="p-4 text-center text-sm text-gray-500">
+                        Loading files...
+                      </div>
+                    ) : googleDriveFiles.length === 0 ? (
+                      <div className="p-4 text-center text-sm text-gray-500">
+                        No files found in your Google Drive
+                      </div>
+                    ) : (
+                      <div>
+                        {googleDriveFiles.map((file) => (
+                          <div
+                            key={file.id}
+                            className="flex cursor-pointer items-center gap-3 p-2 hover:bg-gray-50"
+                            onClick={() => handleFileSelect(file.id)}
+                          >
+                            <input
+                              type="checkbox"
+                              checked={selectedFiles.includes(file.id)}
+                              onChange={() => handleFileSelect(file.id)}
+                              onClick={(e) => e.stopPropagation()}
+                              className="h-4 w-4 text-blue-600 rounded border-gray-300 focus:ring-blue-500"
+                            />
+                            <span>{file.iconUrl}</span>
+                            <div className="min-w-0">
+                              <div className="truncate text-sm">{file.name}</div>
+                              <div className="text-xs text-gray-500">
+                                {file.size} • Modified {file.modifiedTime}
+                              </div>
+                            </div>
+                          </div>
+                        ))}
+                      </div>
+                    )}
+                  </div>
+                </div>
+              </div>
+            )}
+          </div>
+ )} + {renderFormFields()} {IngestorFormSchemas[ingestor.type].some( (field) => field.advanced, @@ -719,7 +1114,10 @@ function Upload({ : 'bg-purple-30 hover:bg-violets-are-blue cursor-pointer text-white' }`} > - {t('modals.uploadDoc.train')} + {ingestor.type === 'google_drive' && selectedFiles.length > 0 + ? `Train with ${selectedFiles.length} file${selectedFiles.length !== 1 ? 's' : ''}` + : t('modals.uploadDoc.train') + } )} diff --git a/frontend/src/upload/types/ingestor.ts b/frontend/src/upload/types/ingestor.ts index cd709847..f5c29dee 100644 --- a/frontend/src/upload/types/ingestor.ts +++ b/frontend/src/upload/types/ingestor.ts @@ -22,7 +22,14 @@ export interface UrlIngestorConfig extends BaseIngestorConfig { url: string; } -export type IngestorType = 'crawler' | 'github' | 'reddit' | 'url'; +export interface GoogleDriveIngestorConfig extends BaseIngestorConfig { + folder_id?: string; + file_ids?: string; + recursive?: boolean; + token_info?: any; +} + +export type IngestorType = 'crawler' | 'github' | 'reddit' | 'url' | 'google_drive'; export interface IngestorConfig { type: IngestorType; @@ -31,7 +38,8 @@ export interface IngestorConfig { | RedditIngestorConfig | GithubIngestorConfig | CrawlerIngestorConfig - | UrlIngestorConfig; + | UrlIngestorConfig + | GoogleDriveIngestorConfig; } export type IngestorFormData = { @@ -109,6 +117,14 @@ export const IngestorFormSchemas: Record = { required: true, }, ], + google_drive: [ + { + name: 'recursive', + label: 'Include subfolders', + type: 'boolean', + required: false, + }, + ], }; export const IngestorDefaultConfigs: Record< @@ -143,4 +159,12 @@ export const IngestorDefaultConfigs: Record< repo_url: '', } as GithubIngestorConfig, }, + google_drive: { + name: '', + config: { + folder_id: '', + file_ids: '', + recursive: true, + } as GoogleDriveIngestorConfig, + }, }; From 8c3f75e3e20f5a2eb90a30c342bae4ac3ee6581d Mon Sep 17 00:00:00 2001 From: ManishMadan2882 Date: Fri, 22 Aug 2025 13:32:40 +0530 Subject: [PATCH 04/25] (feat:ingestion) google drive loader --- .../parser/remote/google_drive_loader.py | 501 ++++++++++++++++++ application/parser/remote/remote_creator.py | 4 +- 2 files changed, 504 insertions(+), 1 deletion(-) create mode 100644 application/parser/remote/google_drive_loader.py diff --git a/application/parser/remote/google_drive_loader.py b/application/parser/remote/google_drive_loader.py new file mode 100644 index 00000000..b2be6c4c --- /dev/null +++ b/application/parser/remote/google_drive_loader.py @@ -0,0 +1,501 @@ +""" +Google Drive loader for DocsGPT. +Loads documents from Google Drive using Google Drive API. 
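+
+Typical usage (illustrative; assumes a session token created by the OAuth callback):
+
+    loader = GoogleDriveLoader(session_token)
+    files = loader.load_data({"folder_id": "<folder-id>", "list_only": True})
+    docs = loader.load_data({"file_ids": ["<file-id>"]})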
+""" + +import io +import logging +import os +from typing import List, Dict, Any, Optional + +from googleapiclient.http import MediaIoBaseDownload +from googleapiclient.errors import HttpError + +from application.parser.remote.base import BaseRemote +from application.parser.remote.google_auth import GoogleDriveAuth +from application.parser.schema.base import Document + + +class GoogleDriveLoader(BaseRemote): + + SUPPORTED_MIME_TYPES = { + 'application/pdf': '.pdf', + 'application/vnd.google-apps.document': '.docx', + 'application/vnd.google-apps.presentation': '.pptx', + 'application/vnd.google-apps.spreadsheet': '.xlsx', + 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx', + 'application/vnd.openxmlformats-officedocument.presentationml.presentation': '.pptx', + 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': '.xlsx', + 'application/msword': '.doc', + 'application/vnd.ms-powerpoint': '.ppt', + 'application/vnd.ms-excel': '.xls', + 'text/plain': '.txt', + 'text/csv': '.csv', + 'text/html': '.html', + 'application/rtf': '.rtf', + 'image/jpeg': '.jpg', + 'image/jpg': '.jpg', + 'image/png': '.png', + } + + EXPORT_FORMATS = { + 'application/vnd.google-apps.document': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', + 'application/vnd.google-apps.presentation': 'application/vnd.openxmlformats-officedocument.presentationml.presentation', + 'application/vnd.google-apps.spreadsheet': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet' + } + + def __init__(self, session_token: str): + self.auth = GoogleDriveAuth() + self.session_token = session_token + + token_info = self.auth.get_token_info_from_session(session_token) + self.credentials = self.auth.create_credentials_from_token_info(token_info) + + try: + self.service = self.auth.build_drive_service(self.credentials) + except Exception as e: + logging.warning(f"Could not build Google Drive service: {e}") + self.service = None + + + + def _process_file(self, file_metadata: Dict[str, Any], load_content: bool = True) -> Optional[Document]: + try: + file_id = file_metadata.get('id') + file_name = file_metadata.get('name', 'Unknown') + mime_type = file_metadata.get('mimeType', 'application/octet-stream') + + if mime_type not in self.SUPPORTED_MIME_TYPES and not mime_type.startswith('application/vnd.google-apps.'): + return None + if mime_type not in self.SUPPORTED_MIME_TYPES and not mime_type.startswith('application/vnd.google-apps.'): + logging.info(f"Skipping unsupported file type: {mime_type} for file {file_name}") + return None + # Google Drive provides timezone-aware ISO8601 dates + doc_metadata = { + 'file_name': file_name, + 'mime_type': mime_type, + 'size': file_metadata.get('size', 'Unknown'), + 'created_time': file_metadata.get('createdTime'), + 'modified_time': file_metadata.get('modifiedTime'), + 'parents': file_metadata.get('parents', []), + 'source': 'google_drive' + } + + if not load_content: + return Document( + text="", + doc_id=file_id, + extra_info=doc_metadata + ) + + content = self._download_file_content(file_id, mime_type) + if content is None: + logging.warning(f"Could not load content for file {file_name} ({file_id})") + return None + + return Document( + text=content, + doc_id=file_id, + extra_info=doc_metadata + ) + + except Exception as e: + logging.error(f"Error processing file: {e}") + return None + + def load_data(self, inputs: Dict[str, Any]) -> List[Document]: + """ + Load items from Google Drive according to simple browsing 
semantics. + + Behavior: + - If file_ids are provided: return those files (optionally with content). + - If folder_id is provided: return the immediate children (folders and files) of that folder. + - If no folder_id: return the immediate children (folders and files) of Drive 'root'. + + Args: + inputs: Dictionary containing configuration: + - folder_id: Optional Google Drive folder ID whose direct children to list + - file_ids: Optional list of specific file IDs to load + - limit: Maximum number of items to return + - list_only: If True, only return metadata without content + - session_token: Optional session token to use for authentication (backward compatibility) + + Returns: + List of Document objects (folders are returned as metadata-only documents) + """ + session_token = inputs.get('session_token') + if session_token and session_token != self.session_token: + logging.warning("Session token in inputs differs from loader's session token. Using loader's session token.") + self.config = inputs + + try: + documents: List[Document] = [] + + folder_id = inputs.get('folder_id') + file_ids = inputs.get('file_ids', []) + limit = inputs.get('limit', 100) + list_only = inputs.get('list_only', False) + load_content = not list_only + + if file_ids: + # Specific files requested: load them + for file_id in file_ids: + try: + doc = self._load_file_by_id(file_id, load_content=load_content) + if doc: + documents.append(doc) + elif hasattr(self, '_credential_refreshed') and self._credential_refreshed: + self._credential_refreshed = False + logging.info(f"Retrying load of file {file_id} after credential refresh") + doc = self._load_file_by_id(file_id, load_content=load_content) + if doc: + documents.append(doc) + except Exception as e: + logging.error(f"Error loading file {file_id}: {e}") + continue + else: + # Browsing mode: list immediate children of provided folder or root + parent_id = folder_id if folder_id else 'root' + documents = self._list_items_in_parent(parent_id, limit=limit, load_content=load_content) + + logging.info(f"Loaded {len(documents)} documents from Google Drive") + return documents + + except Exception as e: + logging.error(f"Error loading data from Google Drive: {e}", exc_info=True) + raise + + def _load_file_by_id(self, file_id: str, load_content: bool = True) -> Optional[Document]: + self._ensure_service() + + try: + file_metadata = self.service.files().get( + fileId=file_id, + fields='id,name,mimeType,size,createdTime,modifiedTime,parents' + ).execute() + + return self._process_file(file_metadata, load_content=load_content) + + except HttpError as e: + logging.error(f"HTTP error loading file {file_id}: {e.resp.status} - {e.content}") + + if e.resp.status in [401, 403]: + if hasattr(self.credentials, 'refresh_token') and self.credentials.refresh_token: + try: + from google.auth.transport.requests import Request + self.credentials.refresh(Request()) + self._ensure_service() + return None + except Exception as refresh_error: + raise ValueError(f"Authentication failed and could not be refreshed: {refresh_error}") + else: + raise ValueError("Authentication failed and cannot be refreshed: missing refresh_token") + + return None + except Exception as e: + logging.error(f"Error loading file {file_id}: {e}") + return None + + + def _list_items_in_parent(self, parent_id: str, limit: int = 100, load_content: bool = False) -> List[Document]: + self._ensure_service() + + documents: List[Document] = [] + + try: + query = f"'{parent_id}' in parents and trashed=false" + page_token = None + + 
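+            # Paging sketch: with limit=150 the first request below asks for
+            # pageSize=100 and the second for 50, stopping once the limit is
+            # reached or nextPageToken is absent.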
while True: + page_size = 100 + if limit: + remaining = max(0, limit - len(documents)) + if remaining == 0: + break + page_size = min(100, remaining) + + results = self.service.files().list( + q=query, + fields='nextPageToken,files(id,name,mimeType,size,createdTime,modifiedTime,parents)', + pageToken=page_token, + pageSize=page_size + ).execute() + + items = results.get('files', []) + for item in items: + mime_type = item.get('mimeType') + if mime_type == 'application/vnd.google-apps.folder': + doc_metadata = { + 'file_name': item.get('name', 'Unknown'), + 'mime_type': mime_type, + 'size': item.get('size', 'Unknown'), + 'created_time': item.get('createdTime'), + 'modified_time': item.get('modifiedTime'), + 'parents': item.get('parents', []), + 'source': 'google_drive', + 'is_folder': True + } + documents.append(Document(text="", doc_id=item.get('id'), extra_info=doc_metadata)) + else: + doc = self._process_file(item, load_content=load_content) + if doc: + documents.append(doc) + + if limit and len(documents) >= limit: + return documents + + page_token = results.get('nextPageToken') + if not page_token: + break + + return documents + except Exception as e: + logging.error(f"Error listing items under parent {parent_id}: {e}") + return documents + + + + + def _download_file_content(self, file_id: str, mime_type: str) -> Optional[str]: + if not self.credentials.token: + logging.warning("No access token in credentials, attempting to refresh") + if hasattr(self.credentials, 'refresh_token') and self.credentials.refresh_token: + try: + from google.auth.transport.requests import Request + self.credentials.refresh(Request()) + logging.info("Credentials refreshed successfully") + self._ensure_service() + except Exception as e: + logging.error(f"Failed to refresh credentials: {e}") + raise ValueError("Authentication failed and cannot be refreshed: missing or invalid refresh_token") + else: + logging.error("No access token and no refresh_token available") + raise ValueError("Authentication failed and cannot be refreshed: missing refresh_token") + + if self.credentials.expired: + logging.warning("Credentials are expired, attempting to refresh") + if hasattr(self.credentials, 'refresh_token') and self.credentials.refresh_token: + try: + from google.auth.transport.requests import Request + self.credentials.refresh(Request()) + logging.info("Credentials refreshed successfully") + self._ensure_service() + except Exception as e: + logging.error(f"Failed to refresh expired credentials: {e}") + raise ValueError("Authentication failed and cannot be refreshed: expired credentials") + else: + logging.error("Credentials expired and no refresh_token available") + raise ValueError("Authentication failed and cannot be refreshed: missing refresh_token") + + try: + if mime_type in self.EXPORT_FORMATS: + export_mime_type = self.EXPORT_FORMATS[mime_type] + request = self.service.files().export_media( + fileId=file_id, + mimeType=export_mime_type + ) + else: + request = self.service.files().get_media(fileId=file_id) + + file_io = io.BytesIO() + downloader = MediaIoBaseDownload(file_io, request) + + done = False + while done is False: + try: + _, done = downloader.next_chunk() + except HttpError as e: + logging.error(f"HTTP error downloading file {file_id}: {e.resp.status} - {e.content}") + return None + except Exception as e: + logging.error(f"Error during download of file {file_id}: {e}") + return None + + content_bytes = file_io.getvalue() + + try: + content = content_bytes.decode('utf-8') + except UnicodeDecodeError: 
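+                # latin-1 maps every byte to a character, so this fallback
+                # cannot raise; binary payloads (e.g. raw PDF bytes) decode to
+                # unreadable text here rather than failing.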
+ try: + content = content_bytes.decode('latin-1') + except UnicodeDecodeError: + logging.error(f"Could not decode file {file_id} as text") + return None + + return content + + except HttpError as e: + logging.error(f"HTTP error downloading file {file_id}: {e.resp.status} - {e.content}") + + if e.resp.status in [401, 403]: + logging.error(f"Authentication error downloading file {file_id}") + + if hasattr(self.credentials, 'refresh_token') and self.credentials.refresh_token: + logging.info(f"Attempting to refresh credentials for file {file_id}") + try: + from google.auth.transport.requests import Request + self.credentials.refresh(Request()) + logging.info("Credentials refreshed successfully") + self._credential_refreshed = True + self._ensure_service() + return None + except Exception as refresh_error: + logging.error(f"Error refreshing credentials: {refresh_error}") + raise ValueError(f"Authentication failed and could not be refreshed: {refresh_error}") + else: + logging.error("Cannot refresh credentials: missing refresh_token") + raise ValueError("Authentication failed and cannot be refreshed: missing refresh_token") + + return None + except Exception as e: + logging.error(f"Error downloading file {file_id}: {e}") + return None + + + def _download_file_to_directory(self, file_id: str, local_dir: str) -> bool: + try: + self._ensure_service() + return self._download_single_file(file_id, local_dir) + except Exception as e: + logging.error(f"Error downloading file {file_id}: {e}", exc_info=True) + return False + + def _ensure_service(self): + if not self.service: + try: + self.service = self.auth.build_drive_service(self.credentials) + except Exception as e: + raise ValueError(f"Cannot access Google Drive: {e}") + + def _download_single_file(self, file_id: str, local_dir: str) -> bool: + file_metadata = self.service.files().get( + fileId=file_id, + fields='name,mimeType' + ).execute() + + file_name = file_metadata['name'] + mime_type = file_metadata['mimeType'] + + if mime_type not in self.SUPPORTED_MIME_TYPES and not mime_type.startswith('application/vnd.google-apps.'): + return False + + os.makedirs(local_dir, exist_ok=True) + full_path = os.path.join(local_dir, file_name) + + if mime_type in self.EXPORT_FORMATS: + export_mime_type = self.EXPORT_FORMATS[mime_type] + request = self.service.files().export_media( + fileId=file_id, + mimeType=export_mime_type + ) + extension = self._get_extension_for_mime_type(export_mime_type) + if not full_path.endswith(extension): + full_path += extension + else: + request = self.service.files().get_media(fileId=file_id) + + with open(full_path, 'wb') as f: + downloader = MediaIoBaseDownload(f, request) + done = False + while not done: + _, done = downloader.next_chunk() + + return True + + def _download_folder_recursive(self, folder_id: str, local_dir: str, recursive: bool = True) -> int: + files_downloaded = 0 + query = f"'{folder_id}' in parents and trashed=false" + + page_token = None + while True: + results = self.service.files().list( + q=query, + fields='nextPageToken,files(id,name,mimeType)', + pageToken=page_token + ).execute() + + files = results.get('files', []) + + for file_metadata in files: + if file_metadata['mimeType'] == 'application/vnd.google-apps.folder': + if recursive: + subfolder_path = os.path.join(local_dir, file_metadata['name']) + os.makedirs(subfolder_path, exist_ok=True) + files_downloaded += self._download_folder_recursive( + file_metadata['id'], + subfolder_path, + recursive + ) + else: + if 
self._download_single_file(file_metadata['id'], local_dir): + files_downloaded += 1 + + page_token = results.get('nextPageToken') + if not page_token: + break + + return files_downloaded + + def _get_extension_for_mime_type(self, mime_type: str) -> str: + extensions = { + 'application/pdf': '.pdf', + 'text/plain': '.txt', + 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx', + 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': '.xlsx', + 'application/vnd.openxmlformats-officedocument.presentationml.presentation': '.pptx', + 'text/html': '.html', + 'text/markdown': '.md', + } + return extensions.get(mime_type, '.bin') + + def _download_folder_contents(self, folder_id: str, local_dir: str, recursive: bool = True) -> int: + try: + self._ensure_service() + return self._download_folder_recursive(folder_id, local_dir, recursive) + except Exception as e: + logging.error(f"Error downloading folder {folder_id}: {e}", exc_info=True) + return 0 + + def download_to_directory(self, local_dir: str, source_config: dict = None) -> dict: + if source_config is None: + source_config = {} + + config = source_config if source_config else getattr(self, 'config', {}) + + files_downloaded = 0 + + try: + folder_id = config.get('folder_id') + file_ids = config.get('file_ids', []) + recursive = config.get('recursive', True) + + if file_ids: + if isinstance(file_ids, str): + file_ids = [file_ids] + + for file_id in file_ids: + if self._download_file_to_directory(file_id, local_dir): + files_downloaded += 1 + + elif folder_id: + files_downloaded = self._download_folder_contents(folder_id, local_dir, recursive) + + else: + raise ValueError("No folder_id or file_ids provided for download") + + return { + "files_downloaded": files_downloaded, + "directory_path": local_dir, + "empty_result": files_downloaded == 0, + "source_type": "google_drive", + "config_used": config + } + + except Exception as e: + return { + "files_downloaded": 0, + "directory_path": local_dir, + "empty_result": True, + "error": str(e), + "source_type": "google_drive" + } diff --git a/application/parser/remote/remote_creator.py b/application/parser/remote/remote_creator.py index 026abd76..4d1f34a2 100644 --- a/application/parser/remote/remote_creator.py +++ b/application/parser/remote/remote_creator.py @@ -3,6 +3,7 @@ from application.parser.remote.crawler_loader import CrawlerLoader from application.parser.remote.web_loader import WebLoader from application.parser.remote.reddit_loader import RedditPostsLoaderRemote from application.parser.remote.github_loader import GitHubLoader +from application.parser.remote.google_drive_loader import GoogleDriveLoader class RemoteCreator: @@ -12,11 +13,12 @@ class RemoteCreator: "crawler": CrawlerLoader, "reddit": RedditPostsLoaderRemote, "github": GitHubLoader, + "google_drive": GoogleDriveLoader, } @classmethod def create_loader(cls, type, *args, **kwargs): loader_class = cls.loaders.get(type.lower()) if not loader_class: - raise ValueError(f"No LLM class found for type {type}") + raise ValueError(f"No loader class found for type {type}") return loader_class(*args, **kwargs) From f82be23ca94f678b6f04c336bc9e37a74b56debd Mon Sep 17 00:00:00 2001 From: ManishMadan2882 Date: Fri, 22 Aug 2025 13:33:21 +0530 Subject: [PATCH 05/25] (feat:ingestion) external drive connect --- application/api/user/routes.py | 445 +++++++++++++++++++++++++++++++++ application/api/user/tasks.py | 7 + application/worker.py | 144 +++++++++++ 3 files changed, 596 insertions(+) diff --git 
a/application/api/user/routes.py b/application/api/user/routes.py index 9a2febbc..c6edec6f 100644 --- a/application/api/user/routes.py +++ b/application/api/user/routes.py @@ -28,6 +28,7 @@ from application.agents.tools.tool_manager import ToolManager from application.api.user.tasks import ( ingest, + ingest_connector_task, ingest_remote, process_agent_webhook, store_attachment, @@ -877,6 +878,42 @@ class UploadRemote(Resource): source_data = config.get("url") elif data["source"] == "reddit": source_data = config + elif data["source"] == "google_drive": + if "session_token" not in config: + return make_response(jsonify({ + "success": False, + "error": "Missing session_token in Google Drive configuration" + }), 400) + + session_token = config.get("session_token") + + file_ids = config.get("file_ids", []) + if isinstance(file_ids, str): + file_ids = [id.strip() for id in file_ids.split(',') if id.strip()] + elif not isinstance(file_ids, list): + file_ids = [] + + folder_id = config.get("folder_id", "") + if not isinstance(folder_id, str): + folder_id = str(folder_id) if folder_id else "" + + recursive = bool(config.get("recursive", False)) + + clean_config = { + "session_token": session_token, + "file_ids": file_ids, + "folder_id": folder_id, + "recursive": recursive + } + + from application.api.user.tasks import ingest_connector_task + task = ingest_connector_task.delay( + source_config=clean_config, + job_name=data["name"], + user=decoded_token.get("sub"), + source_type="google_drive" + ) + return make_response(jsonify({"success": True, "task_id": task.id}), 200) task = ingest_remote.delay( source_data=source_data, job_name=data["name"], @@ -3936,3 +3973,411 @@ class DirectoryStructure(Resource): return make_response( jsonify({"success": False, "error": str(e)}), 500 ) + + +@user_ns.route("/api/google-drive/auth") +class GoogleDriveAuth(Resource): + @api.doc(description="Get Google Drive OAuth authorization URL") + def get(self): + """Get Google Drive OAuth authorization URL""" + try: + from application.parser.remote.google_auth import GoogleDriveAuth + + auth = GoogleDriveAuth() + + # Generate state parameter for CSRF protection + import uuid + state = str(uuid.uuid4()) + + # Store state in session or database for validation + # For now, we'll include it in the URL and validate on callback + authorization_url = auth.get_authorization_url(state=state) + current_app.logger.info(f"Generated authorization URL: {authorization_url}") + return make_response( + jsonify({ + "success": True, + "authorization_url": authorization_url, + "state": state + }), + 200 + ) + + except Exception as e: + current_app.logger.error(f"Error generating Google Drive auth URL: {e}") + return make_response( + jsonify({"success": False, "error": str(e)}), 500 + ) + + +@user_ns.route("/api/google-drive/callback") +class GoogleDriveCallback(Resource): + @api.doc(description="Handle Google Drive OAuth callback") + def get(self): + """Handle Google Drive OAuth callback""" + try: + from application.parser.remote.google_auth import GoogleDriveAuth + from flask import request + import uuid + + # Get authorization code and state from query parameters + authorization_code = request.args.get('code') + _ = request.args.get('state') # We don't currently use state, but capture it to avoid unused variable warning + error = request.args.get('error') + + if error: + return make_response( + jsonify({"success": False, "error": f"OAuth error: {error}. 
Please try again and make sure to grant all requested permissions, including offline access."}), 400 + ) + + if not authorization_code: + return make_response( + jsonify({"success": False, "error": "Authorization code not provided. Please complete the authorization process and make sure to grant offline access."}), 400 + ) + + # Exchange code for tokens + try: + auth = GoogleDriveAuth() + token_info = auth.exchange_code_for_tokens(authorization_code) + + # Log detailed information about the token_info we received + current_app.logger.info(f"Token info received from OAuth callback - has refresh_token: {bool(token_info.get('refresh_token'))}, " + f"has access_token: {bool(token_info.get('access_token'))}, " + f"expiry: {token_info.get('expiry')}") + + # Log the full token_info structure (without sensitive data) + safe_token_info = {k: v for k, v in token_info.items() if k not in ['access_token', 'refresh_token', 'client_secret']} + current_app.logger.info(f"Full token info structure: {safe_token_info}") + + # Validate that we got token info + if not token_info: + current_app.logger.error("exchange_code_for_tokens returned None or empty result") + return make_response( + jsonify({"success": False, "error": "Failed to exchange authorization code for tokens. Please try again and make sure to grant all requested permissions, including offline access."}), 400 + ) + + # Validate required fields in token_info + required_fields = ['access_token', 'token_uri', 'client_id', 'client_secret'] + missing_fields = [field for field in required_fields if not token_info.get(field)] + if missing_fields: + current_app.logger.error(f"Token info missing required fields: {missing_fields}") + return make_response( + jsonify({"success": False, "error": f"Token information incomplete. Missing fields: {missing_fields}. Please try again and make sure to grant all requested permissions."}), 400 + ) + + # Check if refresh_token is present - this is critical for long-term access + if not token_info.get('refresh_token'): + return make_response( + jsonify({ + "success": False, + "error": "OAuth flow did not return a refresh token. This typically happens when offline access wasn't granted. " + "Please reconnect your Google Drive account and ensure you grant offline access when prompted. " + "Make sure to check 'Allow offline access' during the authorization process." 
+ }), 400 + ) + + # Validate required fields in token_info + required_fields = ['access_token', 'token_uri', 'client_id', 'client_secret'] + missing_fields = [field for field in required_fields if not token_info.get(field)] + if missing_fields: + current_app.logger.error(f"Token info missing required fields: {missing_fields}") + return make_response( + jsonify({"success": False, "error": f"Token info missing required fields: {missing_fields}"}), 400 + ) + + except Exception as e: + current_app.logger.error(f"Error exchanging code for tokens: {e}", exc_info=True) + return make_response( + jsonify({"success": False, "error": f"Failed to exchange authorization code for tokens: {str(e)}"}), 400 + ) + + # Get user information + try: + credentials = auth.create_credentials_from_token_info(token_info) + service = auth.build_drive_service(credentials) + user_info = service.about().get(fields="user").execute() + user_email = user_info.get('user', {}).get('emailAddress', 'Connected User') + except Exception as e: + current_app.logger.warning(f"Could not get user info: {e}") + # Try to get user info without building service if we have access token + if token_info.get('access_token'): + try: + import requests + headers = {'Authorization': f'Bearer {token_info["access_token"]}'} + response = requests.get( + 'https://www.googleapis.com/drive/v3/about?fields=user', + headers=headers + ) + if response.status_code == 200: + user_info = response.json() + user_email = user_info.get('user', {}).get('emailAddress', 'Connected User') + else: + user_email = 'Connected User' + except Exception as request_error: + current_app.logger.warning(f"Could not get user info via direct request: {request_error}") + user_email = 'Connected User' + else: + user_email = 'Connected User' + + # Generate a session token + session_token = str(uuid.uuid4()) + + # Store token_info in MongoDB + from application.core.mongo_db import MongoDB + mongo = MongoDB.get_client() + db = mongo[settings.MONGO_DB_NAME] + sessions_collection = db["drive_sessions"] + + # Store only necessary token info, removing sensitive fields + sanitized_token_info = { + "access_token": token_info.get("access_token"), + "refresh_token": token_info.get("refresh_token"), + "token_uri": token_info.get("token_uri"), + "expiry": token_info.get("expiry"), + "scopes": token_info.get("scopes") + } + + # Store the sanitized token info with the session token + sessions_collection.insert_one({ + "session_token": session_token, + "token_info": sanitized_token_info, + "created_at": datetime.datetime.now(datetime.timezone.utc), + "user_email": user_email + }) + + # Return only the session token and user email to the client + return make_response( + jsonify({ + "success": True, + "message": "Google Drive authentication successful", + "session_token": session_token, + "user_email": user_email + }), + 200 + ) + + except Exception as e: + current_app.logger.error(f"Error handling Google Drive callback: {e}") + return make_response( + jsonify({ + "success": False, + "error": f"Failed to complete Google Drive authentication: {str(e)}. Please try again and make sure to grant all requested permissions, including offline access." 
+ }), 500 + ) + + +@user_ns.route("/api/google-drive/refresh") +class GoogleDriveRefresh(Resource): + @api.expect( + api.model( + "GoogleDriveRefreshModel", + { + "refresh_token": fields.String(required=True, description="Refresh token") + } + ) + ) + @api.doc(description="Refresh Google Drive access token") + def post(self): + """Refresh Google Drive access token""" + try: + from application.parser.remote.google_auth import GoogleDriveAuth + + data = request.get_json() + refresh_token = data.get('refresh_token') + + if not refresh_token: + return make_response( + jsonify({"success": False, "error": "Refresh token not provided"}), 400 + ) + + auth = GoogleDriveAuth() + token_info = auth.refresh_access_token(refresh_token) + + return make_response( + jsonify({ + "success": True, + "message": "Token refreshed successfully", + "token_info": token_info + }), + 200 + ) + + except Exception as e: + current_app.logger.error(f"Error refreshing Google Drive token: {e}") + return make_response( + jsonify({ + "success": False, + "error": f"Failed to refresh Google Drive token: {str(e)}. Please reconnect your Google Drive account and make sure to grant offline access." + }), 500 + ) + + +@user_ns.route("/api/google-drive/files") +class GoogleDriveFiles(Resource): + @api.expect( + api.model( + "GoogleDriveFilesModel", + { + "session_token": fields.String(required=True, description="Google Drive session token"), + "folder_id": fields.String(description="Google Drive folder ID to fetch files from. If not provided, fetches from root", required=False), + "limit": fields.Integer(description="Maximum number of files to return", default=50) + } + ) + ) + @api.doc(description="Get list of files from Google Drive") + def post(self): + """Get list of files from Google Drive""" + try: + from application.parser.remote.google_drive_loader import GoogleDriveLoader + + data = request.get_json() + session_token = data.get('session_token') + folder_id = data.get('folder_id') + limit = data.get('limit', 50) + + if not session_token: + return make_response( + jsonify({"success": False, "error": "Session token not provided"}), 400 + ) + + # Create Google Drive loader with session token only + loader = GoogleDriveLoader(session_token) + + # Get files from Google Drive (limit to first N files, metadata only) + files_config = { + 'limit': limit, + 'list_only': True, + 'session_token': session_token, + 'folder_id': folder_id + } + documents = loader.load_data(files_config) + + # Convert documents to file list format + files = [] + for doc in documents[:limit]: + # Use extra_info instead of doc_metadata + metadata = doc.extra_info + files.append({ + 'id': doc.doc_id, + 'name': metadata.get('file_name', 'Unknown File'), + 'type': metadata.get('mime_type', 'unknown'), + 'size': metadata.get('size', 'Unknown'), + 'modifiedTime': metadata.get('modified_time', 'Unknown'), + 'iconUrl': get_file_icon(metadata.get('mime_type', '')) + }) + + return make_response( + jsonify({ + "success": True, + "files": files, + "total": len(files) + }), + 200 + ) + + except Exception as e: + current_app.logger.error(f"Error loading Google Drive files: {e}") + return make_response( + jsonify({ + "success": False, + "error": f"Failed to load files: {str(e)}. Please make sure your Google Drive account is properly connected and you granted offline access during authorization." 
+ }), 500 + ) + +def get_file_icon(mime_type: str) -> str: + """Get appropriate icon for file type""" + if 'pdf' in mime_type: + return '📄' + elif 'word' in mime_type or 'document' in mime_type: + return '📝' + elif 'presentation' in mime_type or 'powerpoint' in mime_type: + return '📊' + elif 'spreadsheet' in mime_type or 'excel' in mime_type: + return '📈' + elif 'text' in mime_type: + return '📄' + elif 'image' in mime_type: + return '🖼️' + else: + return '📄' + +@user_ns.route("/api/google-drive/validate-session") +class GoogleDriveValidateSession(Resource): + @api.expect( + api.model( + "GoogleDriveValidateSessionModel", + { + "session_token": fields.String(required=True, description="Google Drive session token") + } + ) + ) + @api.doc(description="Validate Google Drive session token") + def post(self): + """Validate Google Drive session token and return user info""" + try: + from application.core.mongo_db import MongoDB + from application.parser.remote.google_auth import GoogleDriveAuth + + data = request.get_json() + session_token = data.get('session_token') + + if not session_token: + return make_response( + jsonify({"success": False, "error": "Session token not provided"}), 400 + ) + + # Retrieve session from MongoDB using session token + mongo = MongoDB.get_client() + db = mongo[settings.MONGO_DB_NAME] + sessions_collection = db["drive_sessions"] + + session = sessions_collection.find_one({"session_token": session_token}) + if not session or "token_info" not in session: + return make_response( + jsonify({"success": False, "error": "Invalid or expired session"}), 401 + ) + + # Get token info and check if it's expired + token_info = session["token_info"] + auth = GoogleDriveAuth() + + # Check if token is expired using our improved method + is_expired = auth.is_token_expired(token_info) + + # Attempt to refresh token if needed + if is_expired and 'refresh_token' in token_info: + try: + current_app.logger.info("Refreshing expired Google Drive token") + refreshed_token_info = auth.refresh_access_token(token_info['refresh_token']) + + # Update token in database + sessions_collection.update_one( + {"session_token": session_token}, + {"$set": {"token_info": refreshed_token_info}} + ) + + # Use the refreshed token info + token_info = refreshed_token_info + except Exception as e: + current_app.logger.error(f"Error refreshing token: {e}", exc_info=True) + return make_response( + jsonify({"success": False, "error": "Session expired and could not be refreshed"}), 401 + ) + + # Return success with user email + return make_response( + jsonify({ + "success": True, + "user_email": session.get("user_email", "Connected User"), + "message": "Session is valid" + }), + 200 + ) + + except Exception as e: + current_app.logger.error(f"Error validating Google Drive session: {e}", exc_info=True) + return make_response( + jsonify({ + "success": False, + "error": f"Failed to validate session: {str(e)}. Please reconnect your Google Drive account and make sure to grant offline access during authorization." 
+ }), 500 + ) diff --git a/application/api/user/tasks.py b/application/api/user/tasks.py index 28a78c0d..bfed7f5a 100644 --- a/application/api/user/tasks.py +++ b/application/api/user/tasks.py @@ -47,6 +47,13 @@ def process_agent_webhook(self, agent_id, payload): return resp +@celery.task(bind=True) +def ingest_connector_task(self, source_config, job_name, user, source_type, retriever="classic"): + from application.worker import ingest_connector + resp = ingest_connector(self, job_name, user, source_type, source_config, retriever) + return resp + + @celery.on_after_configure.connect def setup_periodic_tasks(sender, **kwargs): sender.add_periodic_task( diff --git a/application/worker.py b/application/worker.py index 7309806d..a9503734 100755 --- a/application/worker.py +++ b/application/worker.py @@ -6,6 +6,7 @@ import os import shutil import string import tempfile +from typing import Any, Dict import zipfile from collections import Counter @@ -835,3 +836,146 @@ def agent_webhook_worker(self, agent_id, payload): f"Webhook processed for agent {agent_id}", extra={"agent_id": agent_id} ) return {"status": "success", "result": result} + + +def ingest_connector( + self, job_name: str, user: str, source_type: str, + source_config: Dict[str, Any], retriever: str = "classic" +) -> Dict[str, Any]: + """ + ingestion for internal knowledge bases(GoogleDrive). + + Args: + job_name: Name of the ingestion job + user: User identifier + source_type: Type of remote source ("google_drive", "dropbox", etc.) + source_config: Configuration specific to the source type + retriever: Type of retriever to use + """ + logging.info(f"Starting remote ingestion from {source_type} for user: {user}, job: {job_name}") + self.update_state(state="PROGRESS", meta={"current": 1}) + + with tempfile.TemporaryDirectory() as temp_dir: + try: + # Step 1: Get the appropriate remote loader + logging.info(f"source_config {source_config}") + + if source_type == "google_drive": + session_token = source_config.get("session_token") + if not session_token: + raise ValueError("Google Drive connector requires session_token in source_config") + + from application.parser.remote.google_drive_loader import GoogleDriveLoader + remote_loader = GoogleDriveLoader(session_token) + + # Create a clean config for storage that excludes the session token + api_source_config = { + "file_ids": source_config.get("file_ids", []), + "folder_id": source_config.get("folder_id", ""), + } + + if source_config.get("recursive") is not None: + api_source_config["recursive"] = source_config.get("recursive") + else: + remote_loader = RemoteCreator.create_loader(source_type, source_config) + api_source_config = source_config + + # Step 2: Download files to temp directory + self.update_state(state="PROGRESS", meta={"current": 20, "status": "Downloading files"}) + + # For Google Drive, pass the source_config to download_to_directory + if source_type == "google_drive": + download_info = remote_loader.download_to_directory(temp_dir, source_config) + else: + download_info = remote_loader.download_to_directory(temp_dir) + + if download_info.get("empty_result", False) or not download_info.get("files_downloaded", 0): + logging.warning(f"No files were downloaded from {source_type}") + # Create empty result directly instead of calling a separate method + return { + "name": job_name, + "user": user, + "tokens": 0, + "type": source_type, + "source_config": source_config, + "directory_structure": "{}", + } + + # Step 3: Use SimpleDirectoryReader to process downloaded files + 
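+            # SimpleDirectoryReader below walks temp_dir recursively, wraps each
+            # supported file in a Document, and exposes directory_structure,
+            # which is later stored alongside the source metadata.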
self.update_state(state="PROGRESS", meta={"current": 40, "status": "Processing files"}) + reader = SimpleDirectoryReader( + input_dir=temp_dir, + recursive=True, + required_exts=[ + ".rst", ".md", ".pdf", ".txt", ".docx", ".csv", ".epub", + ".html", ".mdx", ".json", ".xlsx", ".pptx", ".png", + ".jpg", ".jpeg", + ], + exclude_hidden=True, + file_metadata=metadata_from_filename, + ) + raw_docs = reader.load_data() + directory_structure = getattr(reader, 'directory_structure', {}) + + # Step 4: Process documents (chunking, embedding, etc.) + self.update_state(state="PROGRESS", meta={"current": 60, "status": "Processing documents"}) + + chunker = Chunker( + chunking_strategy="classic_chunk", + max_tokens=MAX_TOKENS, + min_tokens=MIN_TOKENS, + duplicate_headers=False, + ) + raw_docs = chunker.chunk(documents=raw_docs) + + # Preserve source information in document metadata + for doc in raw_docs: + if hasattr(doc, 'extra_info') and doc.extra_info: + source = doc.extra_info.get('source') + if source and os.path.isabs(source): + # Convert absolute path to relative path + doc.extra_info['source'] = os.path.relpath(source, start=temp_dir) + + docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs] + + # Step 5: Store in vector database + id = ObjectId() + vector_store_path = os.path.join(temp_dir, "vector_store") + os.makedirs(vector_store_path, exist_ok=True) + + self.update_state(state="PROGRESS", meta={"current": 80, "status": "Storing documents"}) + embed_and_store_documents(docs, vector_store_path, id, self) + + tokens = count_tokens_docs(docs) + + # Step 6: Upload index files + file_data = { + "user": user, + "name": job_name, + "tokens": tokens, + "retriever": retriever, + "id": str(id), + "type": source_type, + "remote_data": json.dumps(api_source_config), + "directory_structure": json.dumps(directory_structure) + } + + upload_index(vector_store_path, file_data) + + # Ensure we mark the task as complete + self.update_state(state="PROGRESS", meta={"current": 100, "status": "Complete"}) + + logging.info(f"Remote ingestion completed: {job_name}") + + return { + "user": user, + "name": job_name, + "tokens": tokens, + "type": source_type, + "id": str(id), + "status": "complete" + } + + except Exception as e: + logging.error(f"Error during remote ingestion: {e}", exc_info=True) + raise From 92d6ae54c32c0cdcf509bbeccb343d1932b3cd5c Mon Sep 17 00:00:00 2001 From: ManishMadan2882 Date: Fri, 22 Aug 2025 13:35:03 +0530 Subject: [PATCH 06/25] (fix:google-oauth) no explicit datetime compare --- application/parser/remote/google_auth.py | 71 ++++-------------------- 1 file changed, 10 insertions(+), 61 deletions(-) diff --git a/application/parser/remote/google_auth.py b/application/parser/remote/google_auth.py index 2b736e69..c64e125e 100644 --- a/application/parser/remote/google_auth.py +++ b/application/parser/remote/google_auth.py @@ -1,5 +1,4 @@ import logging -import time import datetime from typing import Optional, Dict, Any @@ -28,7 +27,9 @@ class GoogleDriveAuth: if not self.client_id or not self.client_secret: raise ValueError("Google OAuth credentials not configured. Please set GOOGLE_CLIENT_ID and GOOGLE_CLIENT_SECRET in settings.") - + + + def get_authorization_url(self, state: Optional[str] = None) -> str: """ Generate Google OAuth authorization URL. 
@@ -99,21 +100,6 @@ class GoogleDriveAuth: credentials = flow.credentials - if credentials.expiry: - try: - expiry = credentials.expiry - if expiry.tzinfo is None: - # If expiry is offset-naive, make it offset-aware - expiry = expiry.replace(tzinfo=datetime.timezone.utc) - - current_time = datetime.datetime.now(datetime.timezone.utc) - time_until_expiry = expiry - current_time - logging.info(f"Token expires in: {time_until_expiry}") - except Exception as e: - logging.warning(f"Error calculating token expiry: {e}") - else: - logging.info("Token has no expiry information") - if not credentials.refresh_token: logging.warning("OAuth flow did not return a refresh_token.") if not credentials.token: @@ -128,27 +114,11 @@ class GoogleDriveAuth: if not credentials.client_secret: credentials.client_secret = self.client_secret - if credentials.expiry: - try: - expiry_dt = credentials.expiry - if expiry_dt.tzinfo is None: # Ensure UTC timezone - expiry_dt = expiry_dt.replace(tzinfo=datetime.timezone.utc) - - current_time = datetime.datetime.now(datetime.timezone.utc) - time_until_expiry = expiry_dt - current_time - logging.info(f"Access token expires in {time_until_expiry}") - except Exception as e: - logging.warning(f"Error calculating token expiry: {e}") - if not credentials.refresh_token: raise ValueError( "No refresh token received. This typically happens when offline access wasn't granted. " ) - expiry_iso = None - if credentials.expiry: - expiry_iso = credentials.expiry.isoformat() - return { 'access_token': credentials.token, 'refresh_token': credentials.refresh_token, @@ -156,7 +126,7 @@ class GoogleDriveAuth: 'client_id': credentials.client_id, 'client_secret': credentials.client_secret, 'scopes': credentials.scopes, - 'expiry': expiry_iso + 'expiry': credentials.expiry.isoformat() if credentials.expiry else None } except Exception as e: @@ -179,18 +149,14 @@ class GoogleDriveAuth: from google.auth.transport.requests import Request credentials.refresh(Request()) - expiry_iso = None - if credentials.expiry: - expiry_iso = credentials.expiry.isoformat() - return { 'access_token': credentials.token, - 'refresh_token': refresh_token, + 'refresh_token': refresh_token, 'token_uri': credentials.token_uri, 'client_id': credentials.client_id, 'client_secret': credentials.client_secret, 'scopes': credentials.scopes, - 'expiry': expiry_iso + 'expiry': credentials.expiry.isoformat() if credentials.expiry else None } except Exception as e: logging.error(f"Error refreshing access token: {e}", exc_info=True) @@ -246,28 +212,11 @@ class GoogleDriveAuth: def is_token_expired(self, token_info): if 'expiry' in token_info and token_info['expiry']: try: - import datetime as dt from dateutil import parser - - expiry_input = token_info['expiry'] - - if isinstance(expiry_input, str): - # Parse ISO format string - expiry_dt = parser.parse(expiry_input) - elif isinstance(expiry_input, dt.datetime): - expiry_dt = expiry_input - else: - logging.warning(f"Unexpected expiry format: {type(expiry_input)}") - return True - - # Ensure UTC timezone - if expiry_dt.tzinfo is None: - expiry_dt = expiry_dt.replace(tzinfo=dt.timezone.utc) - - current_time = dt.datetime.now(dt.timezone.utc) - - return current_time >= expiry_dt - dt.timedelta(seconds=60) - + # Google Drive provides timezone-aware ISO8601 dates + expiry_dt = parser.parse(token_info['expiry']) + current_time = datetime.datetime.now(datetime.timezone.utc) + return current_time >= expiry_dt - datetime.timedelta(seconds=60) except Exception: return True From 
e7430f0fbc03c59d909e58e38744c8858f3f4369 Mon Sep 17 00:00:00 2001 From: ManishMadan2882 Date: Fri, 22 Aug 2025 13:36:32 +0530 Subject: [PATCH 07/25] (feat:googleDrive,fe) file tree --- frontend/src/upload/Upload.tsx | 96 ++++++++++++++++++++++++++++++---- 1 file changed, 85 insertions(+), 11 deletions(-) diff --git a/frontend/src/upload/Upload.tsx b/frontend/src/upload/Upload.tsx index 75749f29..f17c7c95 100644 --- a/frontend/src/upload/Upload.tsx +++ b/frontend/src/upload/Upload.tsx @@ -58,6 +58,8 @@ function Upload({ const [isAuthenticating, setIsAuthenticating] = useState(false); const [userEmail, setUserEmail] = useState(''); const [authError, setAuthError] = useState(''); + const [currentFolderId, setCurrentFolderId] = useState(null); + const [folderPath, setFolderPath] = useState>([{id: null, name: 'My Drive'}]); const renderFormFields = () => { const schema = IngestorFormSchemas[ingestor.type]; @@ -540,7 +542,7 @@ function Upload({ if (validateData.success) { setUserEmail(validateData.user_email || 'Connected User'); - loadGoogleDriveFiles(sessionToken); + loadGoogleDriveFiles(sessionToken, null); } else { localStorage.removeItem('google_drive_session_token'); setIsGoogleDriveConnected(false); @@ -612,7 +614,7 @@ function Upload({ window.removeEventListener('message', handleAuthMessage); - loadGoogleDriveFiles(event.data.session_token); + loadGoogleDriveFiles(event.data.session_token, null); } else if (event.data.type === 'google_drive_auth_error') { console.error('OAuth error received:', event.data); setAuthError(event.data.error || 'Authentication failed. Please make sure to grant all requested permissions, including offline access. You may need to revoke previous access and re-authorize.'); @@ -641,21 +643,26 @@ function Upload({ } }; - const loadGoogleDriveFiles = async (sessionToken: string) => { + const loadGoogleDriveFiles = async (sessionToken: string, folderId?: string | null) => { setIsLoadingFiles(true); try { const apiHost = import.meta.env.VITE_API_HOST; + const requestBody: any = { + session_token: sessionToken, + limit: 50 + }; + if (folderId) { + requestBody.folder_id = folderId; + } + const filesResponse = await fetch(`${apiHost}/api/google-drive/files`, { method: 'POST', headers: { 'Content-Type': 'application/json', 'Authorization': `Bearer ${token}` }, - body: JSON.stringify({ - session_token: sessionToken, - limit: 50 - }) + body: JSON.stringify(requestBody) }); if (!filesResponse.ok) { @@ -700,6 +707,15 @@ function Upload({ size: '5.8 MB', modifiedTime: '2024-01-13', iconUrl: '�' + }, + { + id: 'folder1', + name: 'Documents', + type: 'application/vnd.google-apps.folder', + size: '0 bytes', + modifiedTime: '2024-01-13', + iconUrl: '📁', + isFolder: true } ]; setGoogleDriveFiles(mockFiles); @@ -719,6 +735,27 @@ function Upload({ }); }; + const handleFolderClick = (folderId: string, folderName: string) => { + const sessionToken = localStorage.getItem('google_drive_session_token'); + if (sessionToken) { + setCurrentFolderId(folderId); + setFolderPath(prev => [...prev, {id: folderId, name: folderName}]); + loadGoogleDriveFiles(sessionToken, folderId); + } + }; + + const navigateBack = (index: number) => { + const sessionToken = localStorage.getItem('google_drive_session_token'); + if (sessionToken) { + const newPath = folderPath.slice(0, index + 1); + const targetFolderId = newPath[newPath.length - 1]?.id; + + setCurrentFolderId(targetFolderId); + setFolderPath(newPath); + loadGoogleDriveFiles(sessionToken, targetFolderId); + } + }; + const handleSelectAll = () => { 
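    // Select-all toggle: clears the selection when every listed item is already selected.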
     if (selectedFiles.length === googleDriveFiles.length) {
       setSelectedFiles([]);
@@ -1002,6 +1039,22 @@ function Upload({
           {/* File Browser */}
+          {/* Breadcrumb navigation */}
+          <div className="mb-2 flex items-center text-sm">
+            {folderPath.map((path, index) => (
+              <div key={path.id ?? 'root'} className="flex items-center">
+                {index > 0 && <span className="mx-1">/</span>}
+                <button
+                  type="button"
+                  className="hover:underline"
+                  onClick={() => navigateBack(index)}
+                >
+                  {path.name}
+                </button>
+              </div>
+            ))}
+          </div>
+
             Select Files from Google Drive
@@ -1039,10 +1092,9 @@ function Upload({
             {googleDriveFiles.map((file) => (
               <div
                 key={file.id}
                 className="flex items-center rounded p-2 hover:bg-gray-100"
                 onClick={() => handleFileSelect(file.id)}
               >
@@ -1053,9 +1105,31 @@ function Upload({
                   className="h-4 w-4 text-blue-600 rounded border-gray-300 focus:ring-blue-500"
                 />
-                <span className="mr-2">{file.iconUrl}</span>
+                {file.type === 'application/vnd.google-apps.folder' || file.isFolder ? (
+                  <span
+                    className="mr-2 cursor-pointer"
+                    role="img"
+                    aria-label="Folder"
+                    onClick={() => handleFolderClick(file.id, file.name)}
+                  >
+                    {file.iconUrl}
+                  </span>
+                ) : (
+                  <span className="mr-2" role="img" aria-label="File">
+                    {file.iconUrl}
+                  </span>
+                )}
-                <span>{file.name}</span>
+                <span
+                  className="cursor-pointer"
+                  onClick={() => {
+                    if (file.type === 'application/vnd.google-apps.folder' || file.isFolder) {
+                      handleFolderClick(file.id, file.name);
+                    }
+                  }}
+                >
+                  {file.name}
+                </span>
               </div>
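For orientation while reviewing the file-tree changes above: the browser UI drives the /api/google-drive/files endpoint added earlier in this series. The following is a minimal sketch of that request/response contract exercised from a script, assuming a backend at http://localhost:7091 (the default redirect host used elsewhere in the series) and a session token already issued by the OAuth callback; both values are placeholders, and depending on deployment the endpoint may additionally require the Authorization Bearer header the frontend sends.

    import requests

    BASE_URL = "http://localhost:7091"  # placeholder, assumed local backend
    SESSION_TOKEN = "<session token from the OAuth callback>"  # placeholder

    def list_drive_items(folder_id=None, limit=50):
        """Roughly mirrors the frontend's loadGoogleDriveFiles() call."""
        payload = {"session_token": SESSION_TOKEN, "limit": limit}
        if folder_id:
            # Omitted for the root listing, exactly as the UI does for "My Drive".
            payload["folder_id"] = folder_id
        resp = requests.post(f"{BASE_URL}/api/google-drive/files", json=payload)
        resp.raise_for_status()
        data = resp.json()
        for item in data.get("files", []):
            # Folders are flagged by their Drive MIME type, which is also what
            # the breadcrumb UI keys off when deciding to descend.
            is_folder = item["type"] == "application/vnd.google-apps.folder"
            label = "[folder]" if is_folder else "[file]  "
            print(f"{label} {item['name']} ({item['id']})")
        return data

    if __name__ == "__main__":
        list_drive_items()  # root of My Drive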
From 2410bd8654f7af073b2e32457bbe7288697aa105 Mon Sep 17 00:00:00 2001 From: ManishMadan2882 Date: Fri, 22 Aug 2025 19:07:52 +0530 Subject: [PATCH 08/25] (fix:driveLoader) folder ingesting --- application/api/user/routes.py | 27 ++-- .../parser/remote/google_drive_loader.py | 125 ++++++++++++------ application/worker.py | 11 +- frontend/src/upload/Upload.tsx | 34 ++--- 4 files changed, 119 insertions(+), 78 deletions(-) diff --git a/application/api/user/routes.py b/application/api/user/routes.py index c6edec6f..dfe9a8aa 100644 --- a/application/api/user/routes.py +++ b/application/api/user/routes.py @@ -887,28 +887,33 @@ class UploadRemote(Resource): session_token = config.get("session_token") + # Process file_ids file_ids = config.get("file_ids", []) if isinstance(file_ids, str): file_ids = [id.strip() for id in file_ids.split(',') if id.strip()] elif not isinstance(file_ids, list): file_ids = [] - folder_id = config.get("folder_id", "") - if not isinstance(folder_id, str): - folder_id = str(folder_id) if folder_id else "" + + folder_ids = config.get("folder_ids", []) + if isinstance(folder_ids, str): + folder_ids = [id.strip() for id in folder_ids.split(',') if id.strip()] + elif not isinstance(folder_ids, list): + folder_ids = [] - recursive = bool(config.get("recursive", False)) + # Ensure at least one file or folder is selected + if not file_ids and not folder_ids: + return make_response(jsonify({ + "success": False, + "error": "No files or folders selected" + }), 400) - clean_config = { - "session_token": session_token, - "file_ids": file_ids, - "folder_id": folder_id, - "recursive": recursive - } + config["file_ids"] = file_ids + config["folder_ids"] = folder_ids from application.api.user.tasks import ingest_connector_task task = ingest_connector_task.delay( - source_config=clean_config, + source_config=config, job_name=data["name"], user=decoded_token.get("sub"), source_type="google_drive" diff --git a/application/parser/remote/google_drive_loader.py b/application/parser/remote/google_drive_loader.py index b2be6c4c..a5d5cc9f 100644 --- a/application/parser/remote/google_drive_loader.py +++ b/application/parser/remote/google_drive_loader.py @@ -404,37 +404,58 @@ class GoogleDriveLoader(BaseRemote): def _download_folder_recursive(self, folder_id: str, local_dir: str, recursive: bool = True) -> int: files_downloaded = 0 - query = f"'{folder_id}' in parents and trashed=false" - - page_token = None - while True: - results = self.service.files().list( - q=query, - fields='nextPageToken,files(id,name,mimeType)', - pageToken=page_token - ).execute() - - files = results.get('files', []) - - for file_metadata in files: - if file_metadata['mimeType'] == 'application/vnd.google-apps.folder': - if recursive: - subfolder_path = os.path.join(local_dir, file_metadata['name']) - os.makedirs(subfolder_path, exist_ok=True) - files_downloaded += self._download_folder_recursive( - file_metadata['id'], - subfolder_path, - recursive - ) - else: - if self._download_single_file(file_metadata['id'], local_dir): - files_downloaded += 1 - - page_token = results.get('nextPageToken') - if not page_token: - break - - return files_downloaded + try: + os.makedirs(local_dir, exist_ok=True) + + query = f"'{folder_id}' in parents and trashed=false" + page_token = None + + while True: + results = self.service.files().list( + q=query, + fields='nextPageToken, files(id, name, mimeType)', + pageToken=page_token, + pageSize=1000 + ).execute() + + items = results.get('files', []) + logging.info(f"Found {len(items)} items 
in folder {folder_id}") + + for item in items: + item_name = item['name'] + item_id = item['id'] + mime_type = item['mimeType'] + + if mime_type == 'application/vnd.google-apps.folder': + if recursive: + # Create subfolder and recurse + subfolder_path = os.path.join(local_dir, item_name) + os.makedirs(subfolder_path, exist_ok=True) + subfolder_files = self._download_folder_recursive( + item_id, + subfolder_path, + recursive + ) + files_downloaded += subfolder_files + logging.info(f"Downloaded {subfolder_files} files from subfolder {item_name}") + else: + # Download file + success = self._download_single_file(item_id, local_dir) + if success: + files_downloaded += 1 + logging.info(f"Downloaded file: {item_name}") + else: + logging.warning(f"Failed to download file: {item_name}") + + page_token = results.get('nextPageToken') + if not page_token: + break + + return files_downloaded + + except Exception as e: + logging.error(f"Error in _download_folder_recursive for folder {folder_id}: {e}", exc_info=True) + return files_downloaded def _get_extension_for_mime_type(self, mime_type: str) -> str: extensions = { @@ -461,14 +482,15 @@ class GoogleDriveLoader(BaseRemote): source_config = {} config = source_config if source_config else getattr(self, 'config', {}) - files_downloaded = 0 try: - folder_id = config.get('folder_id') + folder_ids = config.get('folder_ids', []) file_ids = config.get('file_ids', []) recursive = config.get('recursive', True) + self._ensure_service() + if file_ids: if isinstance(file_ids, str): file_ids = [file_ids] @@ -477,11 +499,33 @@ class GoogleDriveLoader(BaseRemote): if self._download_file_to_directory(file_id, local_dir): files_downloaded += 1 - elif folder_id: - files_downloaded = self._download_folder_contents(folder_id, local_dir, recursive) + # Process folders + if folder_ids: + if isinstance(folder_ids, str): + folder_ids = [folder_ids] - else: - raise ValueError("No folder_id or file_ids provided for download") + for folder_id in folder_ids: + try: + folder_metadata = self.service.files().get( + fileId=folder_id, + fields='name' + ).execute() + folder_name = folder_metadata.get('name', '') + folder_path = os.path.join(local_dir, folder_name) + os.makedirs(folder_path, exist_ok=True) + + folder_files = self._download_folder_recursive( + folder_id, + folder_path, + recursive + ) + files_downloaded += folder_files + logging.info(f"Downloaded {folder_files} files from folder {folder_name}") + except Exception as e: + logging.error(f"Error downloading folder {folder_id}: {e}", exc_info=True) + + if not file_ids and not folder_ids: + raise ValueError("No folder_ids or file_ids provided for download") return { "files_downloaded": files_downloaded, @@ -493,9 +537,10 @@ class GoogleDriveLoader(BaseRemote): except Exception as e: return { - "files_downloaded": 0, + "files_downloaded": files_downloaded, "directory_path": local_dir, "empty_result": True, - "error": str(e), - "source_type": "google_drive" + "source_type": "google_drive", + "config_used": config, + "error": str(e) } diff --git a/application/worker.py b/application/worker.py index a9503734..99dc2635 100755 --- a/application/worker.py +++ b/application/worker.py @@ -854,11 +854,11 @@ def ingest_connector( """ logging.info(f"Starting remote ingestion from {source_type} for user: {user}, job: {job_name}") self.update_state(state="PROGRESS", meta={"current": 1}) - + with tempfile.TemporaryDirectory() as temp_dir: try: - # Step 1: Get the appropriate remote loader - logging.info(f"source_config {source_config}") + # 
Step 1: Initialize the appropriate loader + self.update_state(state="PROGRESS", meta={"current": 10, "status": "Initializing connector"}) if source_type == "google_drive": session_token = source_config.get("session_token") @@ -871,11 +871,10 @@ def ingest_connector( # Create a clean config for storage that excludes the session token api_source_config = { "file_ids": source_config.get("file_ids", []), - "folder_id": source_config.get("folder_id", ""), + "folder_ids": source_config.get("folder_ids", []), + "recursive": source_config.get("recursive", True) } - if source_config.get("recursive") is not None: - api_source_config["recursive"] = source_config.get("recursive") else: remote_loader = RemoteCreator.create_loader(source_type, source_config) api_source_config = source_config diff --git a/frontend/src/upload/Upload.tsx b/frontend/src/upload/Upload.tsx index f17c7c95..c2cf87ec 100644 --- a/frontend/src/upload/Upload.tsx +++ b/frontend/src/upload/Upload.tsx @@ -447,32 +447,24 @@ function Upload({ if (ingestor.type === 'google_drive') { const sessionToken = localStorage.getItem('google_drive_session_token'); + const selectedItems = googleDriveFiles.filter(file => selectedFiles.includes(file.id)); + const selectedFolderIds = selectedItems + .filter(item => item.type === 'application/vnd.google-apps.folder' || item.isFolder) + .map(folder => folder.id); + + const selectedFileIds = selectedItems + .filter(item => item.type !== 'application/vnd.google-apps.folder' && !item.isFolder) + .map(file => file.id); + configData = { - file_ids: selectedFiles, + file_ids: selectedFileIds, + folder_ids: selectedFolderIds, recursive: ingestor.config.recursive, session_token: sessionToken || null }; } else { - const defaultConfig = IngestorDefaultConfigs[ingestor.type].config; - const mergedConfig = { ...defaultConfig, ...ingestor.config }; - configData = Object.entries(mergedConfig).reduce( - (acc, [key, value]) => { - const field = IngestorFormSchemas[ingestor.type].find( - (f) => f.name === key, - ); - // Include the field if: - // 1. It's required, or - // 2. 
It's optional and has a non-empty value - if ( - field?.required || - (value !== undefined && value !== null && value !== '') - ) { - acc[key] = value; - } - return acc; - }, - {} as Record, - ); + + configData = { ...ingestor.config }; } formData.append('data', JSON.stringify(configData)); From 15a9e97a1ebd595d8be7397df65e2a95a963d605 Mon Sep 17 00:00:00 2001 From: ManishMadan2882 Date: Tue, 26 Aug 2025 00:56:39 +0530 Subject: [PATCH 09/25] (feat:ingest_connectors) spread config params --- application/api/user/routes.py | 8 +++-- application/api/user/tasks.py | 24 +++++++++++++-- application/worker.py | 55 +++++++++++++++++++++++----------- 3 files changed, 66 insertions(+), 21 deletions(-) diff --git a/application/api/user/routes.py b/application/api/user/routes.py index 2a7e9119..ae696952 100644 --- a/application/api/user/routes.py +++ b/application/api/user/routes.py @@ -913,10 +913,14 @@ class UploadRemote(Resource): from application.api.user.tasks import ingest_connector_task task = ingest_connector_task.delay( - source_config=config, job_name=data["name"], user=decoded_token.get("sub"), - source_type="google_drive" + source_type="google_drive", + session_token=session_token, + file_ids=file_ids, + folder_ids=folder_ids, + recursive=config.get("recursive", False), + retriever=config.get("retriever", "classic") ) return make_response(jsonify({"success": True, "task_id": task.id}), 200) task = ingest_remote.delay( diff --git a/application/api/user/tasks.py b/application/api/user/tasks.py index bfed7f5a..833edbff 100644 --- a/application/api/user/tasks.py +++ b/application/api/user/tasks.py @@ -48,9 +48,29 @@ def process_agent_webhook(self, agent_id, payload): @celery.task(bind=True) -def ingest_connector_task(self, source_config, job_name, user, source_type, retriever="classic"): +def ingest_connector_task( + self, + job_name, + user, + source_type, + session_token=None, + file_ids=None, + folder_ids=None, + recursive=True, + retriever="classic" +): from application.worker import ingest_connector - resp = ingest_connector(self, job_name, user, source_type, source_config, retriever) + resp = ingest_connector( + self, + job_name, + user, + source_type, + session_token=session_token, + file_ids=file_ids, + folder_ids=folder_ids, + recursive=recursive, + retriever=retriever + ) return resp diff --git a/application/worker.py b/application/worker.py index 99dc2635..fe386a2d 100755 --- a/application/worker.py +++ b/application/worker.py @@ -839,17 +839,27 @@ def agent_webhook_worker(self, agent_id, payload): def ingest_connector( - self, job_name: str, user: str, source_type: str, - source_config: Dict[str, Any], retriever: str = "classic" + self, + job_name: str, + user: str, + source_type: str, + session_token=None, + file_ids=None, + folder_ids=None, + recursive=True, + retriever: str = "classic" ) -> Dict[str, Any]: """ - ingestion for internal knowledge bases(GoogleDrive). + Ingestion for internal knowledge bases (GoogleDrive, etc.). Args: job_name: Name of the ingestion job user: User identifier source_type: Type of remote source ("google_drive", "dropbox", etc.) 
- source_config: Configuration specific to the source type + session_token: Authentication token for the service + file_ids: List of file IDs to download + folder_ids: List of folder IDs to download + recursive: Whether to recursively download folders retriever: Type of retriever to use """ logging.info(f"Starting remote ingestion from {source_type} for user: {user}, job: {job_name}") @@ -861,31 +871,42 @@ def ingest_connector( self.update_state(state="PROGRESS", meta={"current": 10, "status": "Initializing connector"}) if source_type == "google_drive": - session_token = source_config.get("session_token") if not session_token: - raise ValueError("Google Drive connector requires session_token in source_config") + raise ValueError("Google Drive connector requires session_token") from application.parser.remote.google_drive_loader import GoogleDriveLoader remote_loader = GoogleDriveLoader(session_token) # Create a clean config for storage that excludes the session token api_source_config = { - "file_ids": source_config.get("file_ids", []), - "folder_ids": source_config.get("folder_ids", []), - "recursive": source_config.get("recursive", True) + "file_ids": file_ids or [], + "folder_ids": folder_ids or [], + "recursive": recursive } + # Step 2: Download files to temp directory + self.update_state(state="PROGRESS", meta={"current": 20, "status": "Downloading files"}) + download_info = remote_loader.download_to_directory( + temp_dir, + { + "file_ids": file_ids or [], + "folder_ids": folder_ids or [], + "recursive": recursive + } + ) else: + # For other connectors, maintain backward compatibility + source_config = { + "session_token": session_token + } + if file_ids: + source_config["file_ids"] = file_ids + if folder_ids: + source_config["folder_ids"] = folder_ids + source_config["recursive"] = recursive + remote_loader = RemoteCreator.create_loader(source_type, source_config) api_source_config = source_config - - # Step 2: Download files to temp directory - self.update_state(state="PROGRESS", meta={"current": 20, "status": "Downloading files"}) - - # For Google Drive, pass the source_config to download_to_directory - if source_type == "google_drive": - download_info = remote_loader.download_to_directory(temp_dir, source_config) - else: download_info = remote_loader.download_to_directory(temp_dir) if download_info.get("empty_result", False) or not download_info.get("files_downloaded", 0): From f09f1433a98b94e633bdfdb1298161f76777d2c1 Mon Sep 17 00:00:00 2001 From: ManishMadan2882 Date: Tue, 26 Aug 2025 01:38:36 +0530 Subject: [PATCH 10/25] (feat:connectors) separate layer --- application/api/user/routes.py | 10 ++-- application/parser/connectors/__init__.py | 11 ++++ .../parser/connectors/connector_creator.py | 57 +++++++++++++++++++ .../connectors/google_drive/__init__.py | 10 ++++ .../google_drive/auth.py} | 0 .../google_drive/loader.py} | 22 +++---- application/parser/remote/remote_creator.py | 12 +++- application/worker.py | 36 +++++++----- 8 files changed, 125 insertions(+), 33 deletions(-) create mode 100644 application/parser/connectors/__init__.py create mode 100644 application/parser/connectors/connector_creator.py create mode 100644 application/parser/connectors/google_drive/__init__.py rename application/parser/{remote/google_auth.py => connectors/google_drive/auth.py} (100%) rename application/parser/{remote/google_drive_loader.py => connectors/google_drive/loader.py} (98%) diff --git a/application/api/user/routes.py b/application/api/user/routes.py index ae696952..db371d0c 100644 --- 
a/application/api/user/routes.py +++ b/application/api/user/routes.py @@ -3995,7 +3995,7 @@ class GoogleDriveAuth(Resource): def get(self): """Get Google Drive OAuth authorization URL""" try: - from application.parser.remote.google_auth import GoogleDriveAuth + from application.parser.connectors.google_drive.auth import GoogleDriveAuth auth = GoogleDriveAuth() @@ -4029,7 +4029,7 @@ class GoogleDriveCallback(Resource): def get(self): """Handle Google Drive OAuth callback""" try: - from application.parser.remote.google_auth import GoogleDriveAuth + from application.parser.connectors.google_drive.auth import GoogleDriveAuth from flask import request import uuid @@ -4193,7 +4193,7 @@ class GoogleDriveRefresh(Resource): def post(self): """Refresh Google Drive access token""" try: - from application.parser.remote.google_auth import GoogleDriveAuth + from application.parser.connectors.google_drive.auth import GoogleDriveAuth data = request.get_json() refresh_token = data.get('refresh_token') @@ -4241,7 +4241,7 @@ class GoogleDriveFiles(Resource): def post(self): """Get list of files from Google Drive""" try: - from application.parser.remote.google_drive_loader import GoogleDriveLoader + from application.parser.connectors.google_drive.loader import GoogleDriveLoader data = request.get_json() session_token = data.get('session_token') @@ -4329,7 +4329,7 @@ class GoogleDriveValidateSession(Resource): """Validate Google Drive session token and return user info""" try: from application.core.mongo_db import MongoDB - from application.parser.remote.google_auth import GoogleDriveAuth + from application.parser.connectors.google_drive.auth import GoogleDriveAuth data = request.get_json() session_token = data.get('session_token') diff --git a/application/parser/connectors/__init__.py b/application/parser/connectors/__init__.py new file mode 100644 index 00000000..ee1af121 --- /dev/null +++ b/application/parser/connectors/__init__.py @@ -0,0 +1,11 @@ +""" +External knowledge base connectors for DocsGPT. + +This module contains connectors for external knowledge bases and document storage systems +that require authentication and specialized handling, separate from simple web scrapers. +""" + +from .connector_creator import ConnectorCreator +from .google_drive import GoogleDriveAuth, GoogleDriveLoader + +__all__ = ['ConnectorCreator', 'GoogleDriveAuth', 'GoogleDriveLoader'] diff --git a/application/parser/connectors/connector_creator.py b/application/parser/connectors/connector_creator.py new file mode 100644 index 00000000..cefba7b4 --- /dev/null +++ b/application/parser/connectors/connector_creator.py @@ -0,0 +1,57 @@ +from application.parser.connectors.google_drive.loader import GoogleDriveLoader + + +class ConnectorCreator: + """ + Factory class for creating external knowledge base connectors. + + These are different from remote loaders as they typically require + authentication and connect to external document storage systems. + """ + + connectors = { + "google_drive": GoogleDriveLoader, + } + + @classmethod + def create_connector(cls, connector_type, *args, **kwargs): + """ + Create a connector instance for the specified type. 
+ + Args: + connector_type: Type of connector to create (e.g., 'google_drive') + *args, **kwargs: Arguments to pass to the connector constructor + + Returns: + Connector instance + + Raises: + ValueError: If connector type is not supported + """ + connector_class = cls.connectors.get(connector_type.lower()) + if not connector_class: + raise ValueError(f"No connector class found for type {connector_type}") + return connector_class(*args, **kwargs) + + @classmethod + def get_supported_connectors(cls): + """ + Get list of supported connector types. + + Returns: + List of supported connector type strings + """ + return list(cls.connectors.keys()) + + @classmethod + def is_supported(cls, connector_type): + """ + Check if a connector type is supported. + + Args: + connector_type: Type of connector to check + + Returns: + True if supported, False otherwise + """ + return connector_type.lower() in cls.connectors diff --git a/application/parser/connectors/google_drive/__init__.py b/application/parser/connectors/google_drive/__init__.py new file mode 100644 index 00000000..18abeec1 --- /dev/null +++ b/application/parser/connectors/google_drive/__init__.py @@ -0,0 +1,10 @@ +""" +Google Drive connector for DocsGPT. + +This module provides authentication and document loading capabilities for Google Drive. +""" + +from .auth import GoogleDriveAuth +from .loader import GoogleDriveLoader + +__all__ = ['GoogleDriveAuth', 'GoogleDriveLoader'] diff --git a/application/parser/remote/google_auth.py b/application/parser/connectors/google_drive/auth.py similarity index 100% rename from application/parser/remote/google_auth.py rename to application/parser/connectors/google_drive/auth.py diff --git a/application/parser/remote/google_drive_loader.py b/application/parser/connectors/google_drive/loader.py similarity index 98% rename from application/parser/remote/google_drive_loader.py rename to application/parser/connectors/google_drive/loader.py index a5d5cc9f..d782649c 100644 --- a/application/parser/remote/google_drive_loader.py +++ b/application/parser/connectors/google_drive/loader.py @@ -12,7 +12,7 @@ from googleapiclient.http import MediaIoBaseDownload from googleapiclient.errors import HttpError from application.parser.remote.base import BaseRemote -from application.parser.remote.google_auth import GoogleDriveAuth +from application.parser.connectors.google_drive.auth import GoogleDriveAuth from application.parser.schema.base import Document @@ -329,7 +329,7 @@ class GoogleDriveLoader(BaseRemote): if e.resp.status in [401, 403]: logging.error(f"Authentication error downloading file {file_id}") - + if hasattr(self.credentials, 'refresh_token') and self.credentials.refresh_token: logging.info(f"Attempting to refresh credentials for file {file_id}") try: @@ -406,10 +406,10 @@ class GoogleDriveLoader(BaseRemote): files_downloaded = 0 try: os.makedirs(local_dir, exist_ok=True) - + query = f"'{folder_id}' in parents and trashed=false" page_token = None - + while True: results = self.service.files().list( q=query, @@ -417,15 +417,15 @@ class GoogleDriveLoader(BaseRemote): pageToken=page_token, pageSize=1000 ).execute() - + items = results.get('files', []) logging.info(f"Found {len(items)} items in folder {folder_id}") - + for item in items: item_name = item['name'] item_id = item['id'] mime_type = item['mimeType'] - + if mime_type == 'application/vnd.google-apps.folder': if recursive: # Create subfolder and recurse @@ -446,13 +446,13 @@ class GoogleDriveLoader(BaseRemote): logging.info(f"Downloaded file: 
{item_name}") else: logging.warning(f"Failed to download file: {item_name}") - + page_token = results.get('nextPageToken') if not page_token: break - + return files_downloaded - + except Exception as e: logging.error(f"Error in _download_folder_recursive for folder {folder_id}: {e}", exc_info=True) return files_downloaded @@ -513,7 +513,7 @@ class GoogleDriveLoader(BaseRemote): folder_name = folder_metadata.get('name', '') folder_path = os.path.join(local_dir, folder_name) os.makedirs(folder_path, exist_ok=True) - + folder_files = self._download_folder_recursive( folder_id, folder_path, diff --git a/application/parser/remote/remote_creator.py b/application/parser/remote/remote_creator.py index 4d1f34a2..a47b186a 100644 --- a/application/parser/remote/remote_creator.py +++ b/application/parser/remote/remote_creator.py @@ -3,17 +3,25 @@ from application.parser.remote.crawler_loader import CrawlerLoader from application.parser.remote.web_loader import WebLoader from application.parser.remote.reddit_loader import RedditPostsLoaderRemote from application.parser.remote.github_loader import GitHubLoader -from application.parser.remote.google_drive_loader import GoogleDriveLoader class RemoteCreator: + """ + Factory class for creating remote content loaders. + + These loaders fetch content from remote web sources like URLs, + sitemaps, web crawlers, social media platforms, etc. + + For external knowledge base connectors (like Google Drive), + use ConnectorCreator instead. + """ + loaders = { "url": WebLoader, "sitemap": SitemapLoader, "crawler": CrawlerLoader, "reddit": RedditPostsLoaderRemote, "github": GitHubLoader, - "google_drive": GoogleDriveLoader, } @classmethod diff --git a/application/worker.py b/application/worker.py index fe386a2d..719ebccc 100755 --- a/application/worker.py +++ b/application/worker.py @@ -874,8 +874,8 @@ def ingest_connector( if not session_token: raise ValueError("Google Drive connector requires session_token") - from application.parser.remote.google_drive_loader import GoogleDriveLoader - remote_loader = GoogleDriveLoader(session_token) + from application.parser.connectors.connector_creator import ConnectorCreator + remote_loader = ConnectorCreator.create_connector("google_drive", session_token) # Create a clean config for storage that excludes the session token api_source_config = { @@ -895,19 +895,25 @@ def ingest_connector( } ) else: - # For other connectors, maintain backward compatibility - source_config = { - "session_token": session_token + # For other external knowledge base connectors (future: dropbox, onedrive, etc.) + from application.parser.connectors.connector_creator import ConnectorCreator + + if not ConnectorCreator.is_supported(source_type): + raise ValueError(f"Unsupported connector type: {source_type}. 
Supported types: {ConnectorCreator.get_supported_connectors()}") + + # Create connector with session token and other parameters + remote_loader = ConnectorCreator.create_connector(source_type, session_token) + + api_source_config = { + "file_ids": file_ids or [], + "folder_ids": folder_ids or [], + "recursive": recursive } - if file_ids: - source_config["file_ids"] = file_ids - if folder_ids: - source_config["folder_ids"] = folder_ids - source_config["recursive"] = recursive - - remote_loader = RemoteCreator.create_loader(source_type, source_config) - api_source_config = source_config - download_info = remote_loader.download_to_directory(temp_dir) + + download_info = remote_loader.download_to_directory( + temp_dir, + api_source_config + ) if download_info.get("empty_result", False) or not download_info.get("files_downloaded", 0): logging.warning(f"No files were downloaded from {source_type}") @@ -917,7 +923,7 @@ def ingest_connector( "user": user, "tokens": 0, "type": source_type, - "source_config": source_config, + "source_config": api_source_config, "directory_structure": "{}", } From 578c68205a581d180695c655987d84c0910e113c Mon Sep 17 00:00:00 2001 From: ManishMadan2882 Date: Tue, 26 Aug 2025 02:46:36 +0530 Subject: [PATCH 11/25] (feat:connectors) abstracting auth, base class --- application/api/user/routes.py | 22 +-- application/parser/connectors/__init__.py | 9 +- application/parser/connectors/base.py | 129 ++++++++++++++++++ .../parser/connectors/connector_creator.py | 42 ++++-- .../parser/connectors/google_drive/auth.py | 21 +-- .../parser/connectors/google_drive/loader.py | 23 +--- 6 files changed, 185 insertions(+), 61 deletions(-) create mode 100644 application/parser/connectors/base.py diff --git a/application/api/user/routes.py b/application/api/user/routes.py index db371d0c..a016155b 100644 --- a/application/api/user/routes.py +++ b/application/api/user/routes.py @@ -3995,9 +3995,9 @@ class GoogleDriveAuth(Resource): def get(self): """Get Google Drive OAuth authorization URL""" try: - from application.parser.connectors.google_drive.auth import GoogleDriveAuth + from application.parser.connectors.connector_creator import ConnectorCreator - auth = GoogleDriveAuth() + auth = ConnectorCreator.create_auth("google_drive") # Generate state parameter for CSRF protection import uuid @@ -4029,7 +4029,7 @@ class GoogleDriveCallback(Resource): def get(self): """Handle Google Drive OAuth callback""" try: - from application.parser.connectors.google_drive.auth import GoogleDriveAuth + from application.parser.connectors.connector_creator import ConnectorCreator from flask import request import uuid @@ -4050,7 +4050,7 @@ class GoogleDriveCallback(Resource): # Exchange code for tokens try: - auth = GoogleDriveAuth() + auth = ConnectorCreator.create_auth("google_drive") token_info = auth.exchange_code_for_tokens(authorization_code) # Log detailed information about the token_info we received @@ -4193,7 +4193,7 @@ class GoogleDriveRefresh(Resource): def post(self): """Refresh Google Drive access token""" try: - from application.parser.connectors.google_drive.auth import GoogleDriveAuth + from application.parser.connectors.connector_creator import ConnectorCreator data = request.get_json() refresh_token = data.get('refresh_token') @@ -4203,7 +4203,7 @@ class GoogleDriveRefresh(Resource): jsonify({"success": False, "error": "Refresh token not provided"}), 400 ) - auth = GoogleDriveAuth() + auth = ConnectorCreator.create_auth("google_drive") token_info = auth.refresh_access_token(refresh_token) 
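            # token_info bundles the new access token with the original refresh
            # token, so the caller can persist it over the old session record.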
return make_response( @@ -4241,7 +4241,7 @@ class GoogleDriveFiles(Resource): def post(self): """Get list of files from Google Drive""" try: - from application.parser.connectors.google_drive.loader import GoogleDriveLoader + from application.parser.connectors.connector_creator import ConnectorCreator data = request.get_json() session_token = data.get('session_token') @@ -4254,7 +4254,7 @@ class GoogleDriveFiles(Resource): ) # Create Google Drive loader with session token only - loader = GoogleDriveLoader(session_token) + loader = ConnectorCreator.create_connector("google_drive", session_token) # Get files from Google Drive (limit to first N files, metadata only) files_config = { @@ -4329,7 +4329,7 @@ class GoogleDriveValidateSession(Resource): """Validate Google Drive session token and return user info""" try: from application.core.mongo_db import MongoDB - from application.parser.connectors.google_drive.auth import GoogleDriveAuth + from application.parser.connectors.connector_creator import ConnectorCreator data = request.get_json() session_token = data.get('session_token') @@ -4352,8 +4352,8 @@ class GoogleDriveValidateSession(Resource): # Get token info and check if it's expired token_info = session["token_info"] - auth = GoogleDriveAuth() - + auth = ConnectorCreator.create_auth("google_drive") + # Check if token is expired using our improved method is_expired = auth.is_token_expired(token_info) diff --git a/application/parser/connectors/__init__.py b/application/parser/connectors/__init__.py index ee1af121..c9add3d7 100644 --- a/application/parser/connectors/__init__.py +++ b/application/parser/connectors/__init__.py @@ -5,7 +5,14 @@ This module contains connectors for external knowledge bases and document storag that require authentication and specialized handling, separate from simple web scrapers. """ +from .base import BaseConnectorAuth, BaseConnectorLoader from .connector_creator import ConnectorCreator from .google_drive import GoogleDriveAuth, GoogleDriveLoader -__all__ = ['ConnectorCreator', 'GoogleDriveAuth', 'GoogleDriveLoader'] +__all__ = [ + 'BaseConnectorAuth', + 'BaseConnectorLoader', + 'ConnectorCreator', + 'GoogleDriveAuth', + 'GoogleDriveLoader' +] diff --git a/application/parser/connectors/base.py b/application/parser/connectors/base.py new file mode 100644 index 00000000..dfb6de87 --- /dev/null +++ b/application/parser/connectors/base.py @@ -0,0 +1,129 @@ +""" +Base classes for external knowledge base connectors. + +This module provides minimal abstract base classes that define the essential +interface for external knowledge base connectors. +""" + +from abc import ABC, abstractmethod +from typing import Any, Dict, List, Optional + +from application.parser.schema.base import Document + + +class BaseConnectorAuth(ABC): + """ + Abstract base class for connector authentication. + + Defines the minimal interface that all connector authentication + implementations must follow. + """ + + @abstractmethod + def get_authorization_url(self, state: Optional[str] = None) -> str: + """ + Generate authorization URL for OAuth flows. + + Args: + state: Optional state parameter for CSRF protection + + Returns: + Authorization URL + """ + pass + + @abstractmethod + def exchange_code_for_tokens(self, authorization_code: str) -> Dict[str, Any]: + """ + Exchange authorization code for access tokens. 
+ + Args: + authorization_code: Authorization code from OAuth callback + + Returns: + Dictionary containing token information + """ + pass + + @abstractmethod + def refresh_access_token(self, refresh_token: str) -> Dict[str, Any]: + """ + Refresh an expired access token. + + Args: + refresh_token: Refresh token + + Returns: + Dictionary containing refreshed token information + """ + pass + + @abstractmethod + def is_token_expired(self, token_info: Dict[str, Any]) -> bool: + """ + Check if a token is expired. + + Args: + token_info: Token information dictionary + + Returns: + True if token is expired, False otherwise + """ + pass + + +class BaseConnectorLoader(ABC): + """ + Abstract base class for connector loaders. + + Defines the minimal interface that all connector loader + implementations must follow. + """ + + @abstractmethod + def __init__(self, session_token: str): + """ + Initialize the connector loader. + + Args: + session_token: Authentication session token + """ + pass + + @abstractmethod + def load_data(self, inputs: Dict[str, Any]) -> List[Document]: + """ + Load documents from the external knowledge base. + + Args: + inputs: Configuration dictionary containing: + - file_ids: Optional list of specific file IDs to load + - folder_ids: Optional list of folder IDs to browse/download + - limit: Maximum number of items to return + - list_only: If True, return metadata without content + - recursive: Whether to recursively process folders + + Returns: + List of Document objects + """ + pass + + @abstractmethod + def download_to_directory(self, local_dir: str, source_config: Dict[str, Any] = None) -> Dict[str, Any]: + """ + Download files/folders to a local directory. + + Args: + local_dir: Local directory path to download files to + source_config: Configuration for what to download + + Returns: + Dictionary containing download results: + - files_downloaded: Number of files downloaded + - directory_path: Path where files were downloaded + - empty_result: Whether no files were downloaded + - source_type: Type of connector + - config_used: Configuration that was used + - error: Error message if download failed (optional) + """ + pass diff --git a/application/parser/connectors/connector_creator.py b/application/parser/connectors/connector_creator.py index cefba7b4..bf4456ca 100644 --- a/application/parser/connectors/connector_creator.py +++ b/application/parser/connectors/connector_creator.py @@ -1,30 +1,35 @@ from application.parser.connectors.google_drive.loader import GoogleDriveLoader +from application.parser.connectors.google_drive.auth import GoogleDriveAuth class ConnectorCreator: """ - Factory class for creating external knowledge base connectors. - + Factory class for creating external knowledge base connectors and auth providers. + These are different from remote loaders as they typically require authentication and connect to external document storage systems. """ - + connectors = { "google_drive": GoogleDriveLoader, } + auth_providers = { + "google_drive": GoogleDriveAuth, + } + @classmethod def create_connector(cls, connector_type, *args, **kwargs): """ Create a connector instance for the specified type. 
- + Args: connector_type: Type of connector to create (e.g., 'google_drive') *args, **kwargs: Arguments to pass to the connector constructor - + Returns: Connector instance - + Raises: ValueError: If connector type is not supported """ @@ -33,11 +38,30 @@ class ConnectorCreator: raise ValueError(f"No connector class found for type {connector_type}") return connector_class(*args, **kwargs) + @classmethod + def create_auth(cls, connector_type): + """ + Create an auth provider instance for the specified connector type. + + Args: + connector_type: Type of connector auth to create (e.g., 'google_drive') + + Returns: + Auth provider instance + + Raises: + ValueError: If connector type is not supported for auth + """ + auth_class = cls.auth_providers.get(connector_type.lower()) + if not auth_class: + raise ValueError(f"No auth class found for type {connector_type}") + return auth_class() + @classmethod def get_supported_connectors(cls): """ Get list of supported connector types. - + Returns: List of supported connector type strings """ @@ -47,10 +71,10 @@ class ConnectorCreator: def is_supported(cls, connector_type): """ Check if a connector type is supported. - + Args: connector_type: Type of connector to check - + Returns: True if supported, False otherwise """ diff --git a/application/parser/connectors/google_drive/auth.py b/application/parser/connectors/google_drive/auth.py index c64e125e..01851104 100644 --- a/application/parser/connectors/google_drive/auth.py +++ b/application/parser/connectors/google_drive/auth.py @@ -8,9 +8,10 @@ from googleapiclient.discovery import build from googleapiclient.errors import HttpError from application.core.settings import settings +from application.parser.connectors.base import BaseConnectorAuth -class GoogleDriveAuth: +class GoogleDriveAuth(BaseConnectorAuth): """ Handles Google OAuth 2.0 authentication for Google Drive access. """ @@ -31,15 +32,6 @@ class GoogleDriveAuth: def get_authorization_url(self, state: Optional[str] = None) -> str: - """ - Generate Google OAuth authorization URL. - - Args: - state: Optional state parameter for CSRF protection - - Returns: - Authorization URL for Google OAuth flow - """ try: flow = Flow.from_client_config( { @@ -69,15 +61,6 @@ class GoogleDriveAuth: raise def exchange_code_for_tokens(self, authorization_code: str) -> Dict[str, Any]: - """ - Exchange authorization code for access and refresh tokens. 
- - Args: - authorization_code: Authorization code from OAuth callback - - Returns: - Dictionary containing token information - """ try: if not authorization_code: raise ValueError("Authorization code is required") diff --git a/application/parser/connectors/google_drive/loader.py b/application/parser/connectors/google_drive/loader.py index d782649c..a81ad4d4 100644 --- a/application/parser/connectors/google_drive/loader.py +++ b/application/parser/connectors/google_drive/loader.py @@ -11,12 +11,12 @@ from typing import List, Dict, Any, Optional from googleapiclient.http import MediaIoBaseDownload from googleapiclient.errors import HttpError -from application.parser.remote.base import BaseRemote +from application.parser.connectors.base import BaseConnectorLoader from application.parser.connectors.google_drive.auth import GoogleDriveAuth from application.parser.schema.base import Document -class GoogleDriveLoader(BaseRemote): +class GoogleDriveLoader(BaseConnectorLoader): SUPPORTED_MIME_TYPES = { 'application/pdf': '.pdf', @@ -104,25 +104,6 @@ class GoogleDriveLoader(BaseRemote): return None def load_data(self, inputs: Dict[str, Any]) -> List[Document]: - """ - Load items from Google Drive according to simple browsing semantics. - - Behavior: - - If file_ids are provided: return those files (optionally with content). - - If folder_id is provided: return the immediate children (folders and files) of that folder. - - If no folder_id: return the immediate children (folders and files) of Drive 'root'. - - Args: - inputs: Dictionary containing configuration: - - folder_id: Optional Google Drive folder ID whose direct children to list - - file_ids: Optional list of specific file IDs to load - - limit: Maximum number of items to return - - list_only: If True, only return metadata without content - - session_token: Optional session token to use for authentication (backward compatibility) - - Returns: - List of Document objects (folders are returned as metadata-only documents) - """ session_token = inputs.get('session_token') if session_token and session_token != self.session_token: logging.warning("Session token in inputs differs from loader's session token. 
Using loader's session token.") From 4065041a9fa4aacf16f4b7b3c50748d3df106bf6 Mon Sep 17 00:00:00 2001 From: ManishMadan2882 Date: Thu, 28 Aug 2025 00:51:09 +0530 Subject: [PATCH 12/25] (feat:connectors) separate routes, namespace --- application/api/connector/routes.py | 535 +++++++++++++++++++++ application/api/user/routes.py | 499 ++----------------- application/app.py | 2 + frontend/public/google-drive-callback.html | 11 +- 4 files changed, 584 insertions(+), 463 deletions(-) create mode 100644 application/api/connector/routes.py diff --git a/application/api/connector/routes.py b/application/api/connector/routes.py new file mode 100644 index 00000000..df4c73f4 --- /dev/null +++ b/application/api/connector/routes.py @@ -0,0 +1,535 @@ +import datetime +import json +import os +from functools import wraps +from bson.objectid import ObjectId +from flask import ( + Blueprint, + current_app, + jsonify, + make_response, + request +) +from flask_restx import fields, Namespace, Resource + + +from application.agents.tools.tool_manager import ToolManager + +from application.api.user.tasks import ( + ingest_connector_task, +) +from application.core.mongo_db import MongoDB +from application.core.settings import settings +from application.api import api +from application.storage.storage_creator import StorageCreator +from application.tts.google_tts import GoogleTTS +from application.utils import ( + check_required_fields +) +from application.utils import num_tokens_from_string +from application.vectorstore.vector_creator import VectorCreator +from application.parser.connectors.connector_creator import ConnectorCreator + +storage = StorageCreator.get_storage() + +mongo = MongoDB.get_client() +db = mongo[settings.MONGO_DB_NAME] +sources_collection = db["sources"] + +connector = Blueprint("connector", __name__) +connectors_ns = Namespace("connectors", description="Connector operations", path="/") +api.add_namespace(connectors_ns) + +current_dir = os.path.dirname( + os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +) + + +@connectors_ns.route("/api/connectors/upload") +class UploadConnector(Resource): + @api.expect( + api.model( + "ConnectorUploadModel", + { + "user": fields.String(required=True, description="User ID"), + "source": fields.String( + required=True, description="Source type (google_drive, github, etc.)" + ), + "name": fields.String(required=True, description="Job name"), + "data": fields.String(required=True, description="Configuration data"), + "repo_url": fields.String(description="GitHub repository URL"), + }, + ) + ) + @api.doc( + description="Uploads connector source for vectorization", + ) + def post(self): + decoded_token = request.decoded_token + if not decoded_token: + return make_response(jsonify({"success": False}), 401) + data = request.form + required_fields = ["user", "source", "name", "data"] + missing_fields = check_required_fields(data, required_fields) + if missing_fields: + return missing_fields + try: + config = json.loads(data["data"]) + source_data = None + + if data["source"] == "github": + source_data = config.get("repo_url") + elif data["source"] in ["crawler", "url"]: + source_data = config.get("url") + elif data["source"] == "reddit": + source_data = config + elif data["source"] in ConnectorCreator.get_supported_connectors(): + session_token = config.get("session_token") + if not session_token: + return make_response(jsonify({ + "success": False, + "error": f"Missing session_token in {data['source']} configuration" + }), 400) + + file_ids = 
config.get("file_ids", []) + if isinstance(file_ids, str): + file_ids = [id.strip() for id in file_ids.split(',') if id.strip()] + elif not isinstance(file_ids, list): + file_ids = [] + + folder_ids = config.get("folder_ids", []) + if isinstance(folder_ids, str): + folder_ids = [id.strip() for id in folder_ids.split(',') if id.strip()] + elif not isinstance(folder_ids, list): + folder_ids = [] + + config["file_ids"] = file_ids + config["folder_ids"] = folder_ids + + task = ingest_connector_task.delay( + job_name=data["name"], + user=decoded_token.get("sub"), + source_type=data["source"], + session_token=session_token, + file_ids=file_ids, + folder_ids=folder_ids, + recursive=config.get("recursive", False), + retriever=config.get("retriever", "classic") + ) + return make_response(jsonify({"success": True, "task_id": task.id}), 200) + task = ingest_connector_task.delay( + source_data=source_data, + job_name=data["name"], + user=decoded_token.get("sub"), + loader=data["source"], + ) + except Exception as err: + current_app.logger.error( + f"Error uploading connector source: {err}", exc_info=True + ) + return make_response(jsonify({"success": False}), 400) + return make_response(jsonify({"success": True, "task_id": task.id}), 200) + + +@connectors_ns.route("/api/connectors/task_status") +class ConnectorTaskStatus(Resource): + task_status_model = api.model( + "ConnectorTaskStatusModel", + {"task_id": fields.String(required=True, description="Task ID")}, + ) + + @api.expect(task_status_model) + @api.doc(description="Get connector task status") + def get(self): + task_id = request.args.get("task_id") + if not task_id: + return make_response( + jsonify({"success": False, "message": "Task ID is required"}), 400 + ) + try: + from application.celery_init import celery + + task = celery.AsyncResult(task_id) + task_meta = task.info + print(f"Task status: {task.status}") + if not isinstance( + task_meta, (dict, list, str, int, float, bool, type(None)) + ): + task_meta = str(task_meta) + except Exception as err: + current_app.logger.error(f"Error getting task status: {err}", exc_info=True) + return make_response(jsonify({"success": False}), 400) + return make_response(jsonify({"status": task.status, "result": task_meta}), 200) + + +@connectors_ns.route("/api/connectors/sources") +class ConnectorSources(Resource): + @api.doc(description="Get connector sources") + def get(self): + decoded_token = request.decoded_token + if not decoded_token: + return make_response(jsonify({"success": False}), 401) + user = decoded_token.get("sub") + try: + sources = sources_collection.find({"user": user, "type": "connector"}).sort("date", -1) + connector_sources = [] + for source in sources: + connector_sources.append({ + "id": str(source["_id"]), + "name": source.get("name"), + "date": source.get("date"), + "type": source.get("type"), + "source": source.get("source"), + "tokens": source.get("tokens", ""), + "retriever": source.get("retriever", "classic"), + "syncFrequency": source.get("sync_frequency", ""), + }) + except Exception as err: + current_app.logger.error(f"Error retrieving connector sources: {err}", exc_info=True) + return make_response(jsonify({"success": False}), 400) + return make_response(jsonify(connector_sources), 200) + + +@connectors_ns.route("/api/connectors/delete") +class DeleteConnectorSource(Resource): + @api.doc( + description="Delete a connector source", + params={"source_id": "The source ID to delete"}, + ) + def delete(self): + decoded_token = request.decoded_token + if not decoded_token: + 
return make_response(jsonify({"success": False}), 401) + source_id = request.args.get("source_id") + if not source_id: + return make_response( + jsonify({"success": False, "message": "source_id is required"}), 400 + ) + try: + result = sources_collection.delete_one( + {"_id": ObjectId(source_id), "user": decoded_token.get("sub")} + ) + if result.deleted_count == 0: + return make_response( + jsonify({"success": False, "message": "Source not found"}), 404 + ) + except Exception as err: + current_app.logger.error( + f"Error deleting connector source: {err}", exc_info=True + ) + return make_response(jsonify({"success": False}), 400) + return make_response(jsonify({"success": True}), 200) + + +@connectors_ns.route("/api/connectors/auth") +class ConnectorAuth(Resource): + @api.doc(description="Get connector OAuth authorization URL", params={"provider": "Connector provider (e.g., google_drive)"}) + def get(self): + try: + provider = request.args.get('provider') or request.args.get('source') + if not provider: + return make_response(jsonify({"success": False, "error": "Missing provider"}), 400) + + if not ConnectorCreator.is_supported(provider): + return make_response(jsonify({"success": False, "error": f"Unsupported provider: {provider}"}), 400) + + import uuid + state = str(uuid.uuid4()) + auth = ConnectorCreator.create_auth(provider) + authorization_url = auth.get_authorization_url(state=state) + return make_response(jsonify({ + "success": True, + "authorization_url": authorization_url, + "state": state + }), 200) + except Exception as e: + current_app.logger.error(f"Error generating connector auth URL: {e}") + return make_response(jsonify({"success": False, "error": str(e)}), 500) + + +@connectors_ns.route("/api/connectors/callback") +class ConnectorsCallback(Resource): + @api.doc(description="Handle OAuth callback for external connectors") + def get(self): + """Handle OAuth callback for external connectors""" + try: + from application.parser.connectors.connector_creator import ConnectorCreator + from flask import request + import uuid + + authorization_code = request.args.get('code') + _ = request.args.get('state') + error = request.args.get('error') + + if error: + return make_response( + jsonify({"success": False, "error": f"OAuth error: {error}. Please try again and make sure to grant all requested permissions, including offline access."}), 400 + ) + + if not authorization_code: + return make_response( + jsonify({"success": False, "error": "Authorization code not provided. Please complete the authorization process and make sure to grant offline access."}), 400 + ) + + try: + auth = ConnectorCreator.create_auth("google_drive") + token_info = auth.exchange_code_for_tokens(authorization_code) + current_app.logger.info(f"Token info received from OAuth callback - has refresh_token: {bool(token_info.get('refresh_token'))}, " + f"has access_token: {bool(token_info.get('access_token'))}, " + f"expiry: {token_info.get('expiry')}") + + safe_token_info = {k: v for k, v in token_info.items() if k not in ['access_token', 'refresh_token', 'client_secret']} + current_app.logger.info(f"Full token info structure: {safe_token_info}") + + # Validate that we got token info + if not token_info: + current_app.logger.error("exchange_code_for_tokens returned None or empty result") + return make_response( + jsonify({"success": False, "error": "Failed to exchange authorization code for tokens. 
Please try again and make sure to grant all requested permissions, including offline access."}), 400 + ) + + # Validate required fields in token_info + required_fields = ['access_token', 'token_uri', 'client_id', 'client_secret'] + missing_fields = [field for field in required_fields if not token_info.get(field)] + if missing_fields: + current_app.logger.error(f"Token info missing required fields: {missing_fields}") + return make_response( + jsonify({"success": False, "error": f"Token information incomplete. Missing fields: {missing_fields}. Please try again and make sure to grant all requested permissions."}), 400 + ) + + if not token_info.get('refresh_token'): + current_app.logger.warning("OAuth flow did not return a refresh token - user will need to re-authenticate when token expires") + + except Exception as e: + current_app.logger.error(f"Error exchanging code for tokens: {str(e)}", exc_info=True) + + if 'refresh' in str(e).lower(): + current_app.logger.warning(f"Missing refresh token but continuing: {str(e)}") + + else: + return make_response( + jsonify({"success": False, "error": f"Failed to exchange authorization code for tokens: {str(e)}"}), 400 + ) + + try: + credentials = auth.create_credentials_from_token_info(token_info) + service = auth.build_drive_service(credentials) + user_info = service.about().get(fields="user").execute() + user_email = user_info.get('user', {}).get('emailAddress', 'Connected User') + except Exception as e: + current_app.logger.warning(f"Could not get user info: {e}") + if token_info.get('access_token'): + try: + import requests + headers = {'Authorization': f'Bearer {token_info["access_token"]}'} + response = requests.get( + 'https://www.googleapis.com/drive/v3/about?fields=user', + headers=headers + ) + if response.status_code == 200: + user_info = response.json() + user_email = user_info.get('user', {}).get('emailAddress', 'Connected User') + else: + user_email = 'Connected User' + except Exception as request_error: + current_app.logger.warning(f"Could not get user info via direct request: {request_error}") + user_email = 'Connected User' + else: + user_email = 'Connected User' + + session_token = str(uuid.uuid4()) + + from application.core.mongo_db import MongoDB + mongo = MongoDB.get_client() + db = mongo[settings.MONGO_DB_NAME] + sessions_collection = db["connector_sessions"] + + sanitized_token_info = { + "access_token": token_info.get("access_token"), + "refresh_token": token_info.get("refresh_token"), + "token_uri": token_info.get("token_uri"), + "expiry": token_info.get("expiry"), + "scopes": token_info.get("scopes") + } + + sessions_collection.insert_one({ + "session_token": session_token, + "token_info": sanitized_token_info, + "created_at": datetime.datetime.now(datetime.timezone.utc), + "user_email": user_email + }) + + return make_response( + jsonify({ + "success": True, + "message": "Google Drive authentication successful", + "session_token": session_token, + "user_email": user_email + }), + 200 + ) + + except Exception as e: + current_app.logger.error(f"Error handling connector callback: {e}") + return make_response( + jsonify({ + "success": 
False, + "error": f"Failed to complete connector authentication: {str(e)}. Please try again and make sure to grant all requested permissions, including offline access." + }), 500 + ) + + +@connectors_ns.route("/api/connectors/refresh") +class ConnectorRefresh(Resource): + @api.expect(api.model("ConnectorRefreshModel", {"provider": fields.String(required=True), "refresh_token": fields.String(required=True)})) + @api.doc(description="Refresh connector access token") + def post(self): + try: + data = request.get_json() + provider = data.get('provider') + refresh_token = data.get('refresh_token') + + if not provider or not refresh_token: + return make_response(jsonify({"success": False, "error": "provider and refresh_token are required"}), 400) + + auth = ConnectorCreator.create_auth(provider) + token_info = auth.refresh_access_token(refresh_token) + return make_response(jsonify({"success": True, "token_info": token_info}), 200) + except Exception as e: + current_app.logger.error(f"Error refreshing token for connector: {e}") + return make_response(jsonify({"success": False, "error": str(e)}), 500) + + +@connectors_ns.route("/api/connectors/files") +class ConnectorFiles(Resource): + @api.expect(api.model("ConnectorFilesModel", {"provider": fields.String(required=True), "session_token": fields.String(required=True), "folder_id": fields.String(required=False), "limit": fields.Integer(required=False)})) + @api.doc(description="List files from a connector provider") + def post(self): + try: + data = request.get_json() + provider = data.get('provider') + session_token = data.get('session_token') + folder_id = data.get('folder_id') + limit = data.get('limit', 50) + if not provider or not session_token: + return make_response(jsonify({"success": False, "error": "provider and session_token are required"}), 400) + + loader = ConnectorCreator.create_connector(provider, session_token) + documents = loader.load_data({ + 'limit': limit, + 'list_only': True, + 'session_token': session_token, + 'folder_id': folder_id + }) + + files = [] + for doc in documents[:limit]: + metadata = doc.extra_info + files.append({ + 'id': doc.doc_id, + 'name': metadata.get('file_name', 'Unknown File'), + 'type': metadata.get('mime_type', 'unknown'), + 'size': metadata.get('size', 'Unknown'), + 'modifiedTime': metadata.get('modified_time', 'Unknown'), + 'iconUrl': get_file_icon(metadata.get('mime_type', '')) + }) + + return make_response(jsonify({"success": True, "files": files, "total": len(files)}), 200) + except Exception as e: + current_app.logger.error(f"Error loading connector files: {e}") + return make_response(jsonify({"success": False, "error": f"Failed to load files: {str(e)}"}), 500) + + +@connectors_ns.route("/api/connectors/validate-session") +class ConnectorValidateSession(Resource): + @api.expect(api.model("ConnectorValidateSessionModel", {"provider": fields.String(required=True), "session_token": fields.String(required=True)})) + @api.doc(description="Validate connector session token and return user info") + def post(self): + try: + from application.core.mongo_db import MongoDB + data = request.get_json() + provider = data.get('provider') + session_token = data.get('session_token') + if not provider or not session_token: + return make_response(jsonify({"success": False, "error": "provider and session_token are required"}), 400) + + mongo = MongoDB.get_client() + db = mongo[settings.MONGO_DB_NAME] + collection_name = "connector_sessions" + sessions_collection = db[collection_name] + + session = 
sessions_collection.find_one({"session_token": session_token}) + if not session or "token_info" not in session: + return make_response(jsonify({"success": False, "error": "Invalid or expired session"}), 401) + + token_info = session["token_info"] + auth = ConnectorCreator.create_auth(provider) + is_expired = auth.is_token_expired(token_info) + + return make_response(jsonify({ + "success": True, + "expired": is_expired, + "user_email": session.get('user_email', 'Connected User') + }), 200) + except Exception as e: + current_app.logger.error(f"Error validating connector session: {e}") + return make_response(jsonify({"success": False, "error": str(e)}), 500) + + +@connectors_ns.route("/api/connectors/disconnect") +class ConnectorDisconnect(Resource): + @api.expect(api.model("ConnectorDisconnectModel", {"provider": fields.String(required=True), "session_token": fields.String(required=False)})) + @api.doc(description="Disconnect a connector session") + def post(self): + try: + from application.core.mongo_db import MongoDB + data = request.get_json() + provider = data.get('provider') + session_token = data.get('session_token') + if not provider: + return make_response(jsonify({"success": False, "error": "provider is required"}), 400) + + mongo = MongoDB.get_client() + db = mongo[settings.MONGO_DB_NAME] + collection_name = "connector_sessions" + sessions_collection = db[collection_name] + + if session_token: + sessions_collection.delete_one({"session_token": session_token}) + + return make_response(jsonify({"success": True}), 200) + except Exception as e: + current_app.logger.error(f"Error disconnecting connector session: {e}") + return make_response(jsonify({"success": False, "error": str(e)}), 500) + + +def get_file_icon(mime_type): + """Return appropriate icon URL based on file MIME type""" + icon_map = { + 'application/vnd.google-apps.document': '/icons/google-docs.png', + 'application/vnd.google-apps.spreadsheet': '/icons/google-sheets.png', + 'application/vnd.google-apps.presentation': '/icons/google-slides.png', + 'application/pdf': '/icons/pdf.png', + 'text/plain': '/icons/text.png', + 'application/msword': '/icons/word.png', + 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '/icons/word.png', + 'application/vnd.ms-excel': '/icons/excel.png', + 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': '/icons/excel.png', + 'application/vnd.ms-powerpoint': '/icons/powerpoint.png', + 'application/vnd.openxmlformats-officedocument.presentationml.presentation': '/icons/powerpoint.png', + 'image/jpeg': '/icons/image.png', + 'image/png': '/icons/image.png', + 'image/gif': '/icons/image.png', + 'video/mp4': '/icons/video.png', + 'application/zip': '/icons/archive.png', + 'application/x-zip-compressed': '/icons/archive.png', + } + return icon_map.get(mime_type, '/icons/generic-file.png') diff --git a/application/api/user/routes.py b/application/api/user/routes.py index a016155b..0bf6aa2f 100644 --- a/application/api/user/routes.py +++ b/application/api/user/routes.py @@ -47,6 +47,7 @@ from application.utils import ( ) from application.utils import num_tokens_from_string from application.vectorstore.vector_creator import VectorCreator +from application.parser.connectors.connector_creator import ConnectorCreator storage = StorageCreator.get_storage() @@ -493,9 +494,9 @@ class DeleteOldIndexes(Resource): ) if not doc: return make_response(jsonify({"status": "not found"}), 404) - + storage = StorageCreator.get_storage() - + try: # Delete vector index if 
settings.VECTOR_STORE == "faiss": @@ -509,7 +510,7 @@ class DeleteOldIndexes(Resource): settings.VECTOR_STORE, source_id=str(doc["_id"]) ) vectorstore.delete_index() - + if "file_path" in doc and doc["file_path"]: file_path = doc["file_path"] if storage.is_directory(file_path): @@ -518,7 +519,7 @@ class DeleteOldIndexes(Resource): storage.delete_file(f) else: storage.delete_file(file_path) - + except FileNotFoundError: pass except Exception as err: @@ -526,7 +527,7 @@ class DeleteOldIndexes(Resource): f"Error deleting files and indexes: {err}", exc_info=True ) return make_response(jsonify({"success": False}), 400) - + sources_collection.delete_one({"_id": ObjectId(source_id)}) return make_response(jsonify({"success": True}), 200) @@ -574,30 +575,30 @@ class UploadFile(Resource): try: storage = StorageCreator.get_storage() - - + + for file in files: original_filename = file.filename safe_file = safe_filename(original_filename) - + with tempfile.TemporaryDirectory() as temp_dir: temp_file_path = os.path.join(temp_dir, safe_file) file.save(temp_file_path) - + if zipfile.is_zipfile(temp_file_path): try: with zipfile.ZipFile(temp_file_path, 'r') as zip_ref: zip_ref.extractall(path=temp_dir) - + # Walk through extracted files and upload them for root, _, files in os.walk(temp_dir): for extracted_file in files: if os.path.join(root, extracted_file) == temp_file_path: continue - + rel_path = os.path.relpath(os.path.join(root, extracted_file), temp_dir) storage_path = f"{base_path}/{rel_path}" - + with open(os.path.join(root, extracted_file), 'rb') as f: storage.save_file(f, storage_path) except Exception as e: @@ -611,7 +612,7 @@ class UploadFile(Resource): file_path = f"{base_path}/{safe_file}" with open(temp_file_path, 'rb') as f: storage.save_file(f, file_path) - + task = ingest.delay( settings.UPLOAD_FOLDER, [ @@ -687,8 +688,8 @@ class ManageSourceFiles(Resource): try: storage = StorageCreator.get_storage() source_file_path = source.get("file_path", "") - parent_dir = request.form.get("parent_dir", "") - + parent_dir = request.form.get("parent_dir", "") + if parent_dir and (parent_dir.startswith("/") or ".." 
in parent_dir): return make_response( jsonify({"success": False, "message": "Invalid parent directory path"}), 400 @@ -702,7 +703,7 @@ class ManageSourceFiles(Resource): ) added_files = [] - + target_dir = source_file_path if parent_dir: target_dir = f"{source_file_path}/{parent_dir}" @@ -878,44 +879,35 @@ class UploadRemote(Resource): source_data = config.get("url") elif data["source"] == "reddit": source_data = config - elif data["source"] == "google_drive": - if "session_token" not in config: - return make_response(jsonify({ - "success": False, - "error": "Missing session_token in Google Drive configuration" - }), 400) - + elif data["source"] in ConnectorCreator.get_supported_connectors(): session_token = config.get("session_token") - + if not session_token: + return make_response(jsonify({ + "success": False, + "error": f"Missing session_token in {data['source']} configuration" + }), 400) + # Process file_ids file_ids = config.get("file_ids", []) if isinstance(file_ids, str): file_ids = [id.strip() for id in file_ids.split(',') if id.strip()] elif not isinstance(file_ids, list): file_ids = [] - + # Process folder_ids folder_ids = config.get("folder_ids", []) if isinstance(folder_ids, str): folder_ids = [id.strip() for id in folder_ids.split(',') if id.strip()] elif not isinstance(folder_ids, list): folder_ids = [] - - # Ensure at least one file or folder is selected - if not file_ids and not folder_ids: - return make_response(jsonify({ - "success": False, - "error": "No files or folders selected" - }), 400) - + config["file_ids"] = file_ids config["folder_ids"] = folder_ids - - from application.api.user.tasks import ingest_connector_task + task = ingest_connector_task.delay( job_name=data["name"], user=decoded_token.get("sub"), - source_type="google_drive", + source_type=data["source"], session_token=session_token, file_ids=file_ids, folder_ids=folder_ids, @@ -1453,7 +1445,7 @@ class CreateAgent(Resource): except json.JSONDecodeError: data["json_schema"] = None print(f"Received data: {data}") - + # Validate JSON schema if provided if data.get("json_schema"): try: @@ -1461,19 +1453,19 @@ class CreateAgent(Resource): json_schema = data.get("json_schema") if not isinstance(json_schema, dict): return make_response( - jsonify({"success": False, "message": "JSON schema must be a valid JSON object"}), + jsonify({"success": False, "message": "JSON schema must be a valid JSON object"}), 400 ) - + # Validate that it has either a 'schema' property or is itself a schema if "schema" not in json_schema and "type" not in json_schema: return make_response( - jsonify({"success": False, "message": "JSON schema must contain either a 'schema' property or be a valid JSON schema with 'type' property"}), + jsonify({"success": False, "message": "JSON schema must contain either a 'schema' property or be a valid JSON schema with 'type' property"}), 400 ) except Exception as e: return make_response( - jsonify({"success": False, "message": f"Invalid JSON schema: {str(e)}"}), + jsonify({"success": False, "message": f"Invalid JSON schema: {str(e)}"}), 400 ) @@ -3607,7 +3599,7 @@ class GetChunks(Resource): try: store = get_vector_store(doc_id) chunks = store.get_chunks() - + filtered_chunks = [] for chunk in chunks: metadata = chunk.get("metadata", {}) @@ -3628,9 +3620,9 @@ class GetChunks(Resource): continue filtered_chunks.append(chunk) - + chunks = filtered_chunks - + total_chunks = len(chunks) start = (page - 1) * per_page end = start + per_page @@ -3951,27 +3943,27 @@ class DirectoryStructure(Resource): 
decoded_token = request.decoded_token if not decoded_token: return make_response(jsonify({"success": False}), 401) - + user = decoded_token.get("sub") doc_id = request.args.get("id") - + if not doc_id: return make_response( jsonify({"error": "Document ID is required"}), 400 ) - + if not ObjectId.is_valid(doc_id): return make_response(jsonify({"error": "Invalid document ID"}), 400) - + try: doc = sources_collection.find_one({"_id": ObjectId(doc_id), "user": user}) if not doc: return make_response( jsonify({"error": "Document not found or access denied"}), 404 ) - + directory_structure = doc.get("directory_structure", {}) - + return make_response( jsonify({ "success": True, @@ -3979,7 +3971,7 @@ class DirectoryStructure(Resource): "base_path": doc.get("file_path", "") }), 200 ) - + except Exception as e: current_app.logger.error( f"Error retrieving directory structure: {e}", exc_info=True @@ -3989,409 +3981,4 @@ class DirectoryStructure(Resource): ) -@user_ns.route("/api/google-drive/auth") -class GoogleDriveAuth(Resource): - @api.doc(description="Get Google Drive OAuth authorization URL") - def get(self): - """Get Google Drive OAuth authorization URL""" - try: - from application.parser.connectors.connector_creator import ConnectorCreator - auth = ConnectorCreator.create_auth("google_drive") - - # Generate state parameter for CSRF protection - import uuid - state = str(uuid.uuid4()) - - # Store state in session or database for validation - # For now, we'll include it in the URL and validate on callback - authorization_url = auth.get_authorization_url(state=state) - current_app.logger.info(f"Generated authorization URL: {authorization_url}") - return make_response( - jsonify({ - "success": True, - "authorization_url": authorization_url, - "state": state - }), - 200 - ) - - except Exception as e: - current_app.logger.error(f"Error generating Google Drive auth URL: {e}") - return make_response( - jsonify({"success": False, "error": str(e)}), 500 - ) - - -@user_ns.route("/api/google-drive/callback") -class GoogleDriveCallback(Resource): - @api.doc(description="Handle Google Drive OAuth callback") - def get(self): - """Handle Google Drive OAuth callback""" - try: - from application.parser.connectors.connector_creator import ConnectorCreator - from flask import request - import uuid - - # Get authorization code and state from query parameters - authorization_code = request.args.get('code') - _ = request.args.get('state') # We don't currently use state, but capture it to avoid unused variable warning - error = request.args.get('error') - - if error: - return make_response( - jsonify({"success": False, "error": f"OAuth error: {error}. Please try again and make sure to grant all requested permissions, including offline access."}), 400 - ) - - if not authorization_code: - return make_response( - jsonify({"success": False, "error": "Authorization code not provided. 
Please complete the authorization process and make sure to grant offline access."}), 400 - ) - - # Exchange code for tokens - try: - auth = ConnectorCreator.create_auth("google_drive") - token_info = auth.exchange_code_for_tokens(authorization_code) - - # Log detailed information about the token_info we received - current_app.logger.info(f"Token info received from OAuth callback - has refresh_token: {bool(token_info.get('refresh_token'))}, " - f"has access_token: {bool(token_info.get('access_token'))}, " - f"expiry: {token_info.get('expiry')}") - - # Log the full token_info structure (without sensitive data) - safe_token_info = {k: v for k, v in token_info.items() if k not in ['access_token', 'refresh_token', 'client_secret']} - current_app.logger.info(f"Full token info structure: {safe_token_info}") - - # Validate that we got token info - if not token_info: - current_app.logger.error("exchange_code_for_tokens returned None or empty result") - return make_response( - jsonify({"success": False, "error": "Failed to exchange authorization code for tokens. Please try again and make sure to grant all requested permissions, including offline access."}), 400 - ) - - # Validate required fields in token_info - required_fields = ['access_token', 'token_uri', 'client_id', 'client_secret'] - missing_fields = [field for field in required_fields if not token_info.get(field)] - if missing_fields: - current_app.logger.error(f"Token info missing required fields: {missing_fields}") - return make_response( - jsonify({"success": False, "error": f"Token information incomplete. Missing fields: {missing_fields}. Please try again and make sure to grant all requested permissions."}), 400 - ) - - # Check if refresh_token is present - this is critical for long-term access - if not token_info.get('refresh_token'): - return make_response( - jsonify({ - "success": False, - "error": "OAuth flow did not return a refresh token. This typically happens when offline access wasn't granted. " - "Please reconnect your Google Drive account and ensure you grant offline access when prompted. " - "Make sure to check 'Allow offline access' during the authorization process." 
- }), 400 - ) - - # Validate required fields in token_info - required_fields = ['access_token', 'token_uri', 'client_id', 'client_secret'] - missing_fields = [field for field in required_fields if not token_info.get(field)] - if missing_fields: - current_app.logger.error(f"Token info missing required fields: {missing_fields}") - return make_response( - jsonify({"success": False, "error": f"Token info missing required fields: {missing_fields}"}), 400 - ) - - except Exception as e: - current_app.logger.error(f"Error exchanging code for tokens: {e}", exc_info=True) - return make_response( - jsonify({"success": False, "error": f"Failed to exchange authorization code for tokens: {str(e)}"}), 400 - ) - - # Get user information - try: - credentials = auth.create_credentials_from_token_info(token_info) - service = auth.build_drive_service(credentials) - user_info = service.about().get(fields="user").execute() - user_email = user_info.get('user', {}).get('emailAddress', 'Connected User') - except Exception as e: - current_app.logger.warning(f"Could not get user info: {e}") - # Try to get user info without building service if we have access token - if token_info.get('access_token'): - try: - import requests - headers = {'Authorization': f'Bearer {token_info["access_token"]}'} - response = requests.get( - 'https://www.googleapis.com/drive/v3/about?fields=user', - headers=headers - ) - if response.status_code == 200: - user_info = response.json() - user_email = user_info.get('user', {}).get('emailAddress', 'Connected User') - else: - user_email = 'Connected User' - except Exception as request_error: - current_app.logger.warning(f"Could not get user info via direct request: {request_error}") - user_email = 'Connected User' - else: - user_email = 'Connected User' - - # Generate a session token - session_token = str(uuid.uuid4()) - - # Store token_info in MongoDB - from application.core.mongo_db import MongoDB - mongo = MongoDB.get_client() - db = mongo[settings.MONGO_DB_NAME] - sessions_collection = db["drive_sessions"] - - # Store only necessary token info, removing sensitive fields - sanitized_token_info = { - "access_token": token_info.get("access_token"), - "refresh_token": token_info.get("refresh_token"), - "token_uri": token_info.get("token_uri"), - "expiry": token_info.get("expiry"), - "scopes": token_info.get("scopes") - } - - # Store the sanitized token info with the session token - sessions_collection.insert_one({ - "session_token": session_token, - "token_info": sanitized_token_info, - "created_at": datetime.datetime.now(datetime.timezone.utc), - "user_email": user_email - }) - - # Return only the session token and user email to the client - return make_response( - jsonify({ - "success": True, - "message": "Google Drive authentication successful", - "session_token": session_token, - "user_email": user_email - }), - 200 - ) - - except Exception as e: - current_app.logger.error(f"Error handling Google Drive callback: {e}") - return make_response( - jsonify({ - "success": False, - "error": f"Failed to complete Google Drive authentication: {str(e)}. Please try again and make sure to grant all requested permissions, including offline access." 
- }), 500 - ) - - -@user_ns.route("/api/google-drive/refresh") -class GoogleDriveRefresh(Resource): - @api.expect( - api.model( - "GoogleDriveRefreshModel", - { - "refresh_token": fields.String(required=True, description="Refresh token") - } - ) - ) - @api.doc(description="Refresh Google Drive access token") - def post(self): - """Refresh Google Drive access token""" - try: - from application.parser.connectors.connector_creator import ConnectorCreator - - data = request.get_json() - refresh_token = data.get('refresh_token') - - if not refresh_token: - return make_response( - jsonify({"success": False, "error": "Refresh token not provided"}), 400 - ) - - auth = ConnectorCreator.create_auth("google_drive") - token_info = auth.refresh_access_token(refresh_token) - - return make_response( - jsonify({ - "success": True, - "message": "Token refreshed successfully", - "token_info": token_info - }), - 200 - ) - - except Exception as e: - current_app.logger.error(f"Error refreshing Google Drive token: {e}") - return make_response( - jsonify({ - "success": False, - "error": f"Failed to refresh Google Drive token: {str(e)}. Please reconnect your Google Drive account and make sure to grant offline access." - }), 500 - ) - - -@user_ns.route("/api/google-drive/files") -class GoogleDriveFiles(Resource): - @api.expect( - api.model( - "GoogleDriveFilesModel", - { - "session_token": fields.String(required=True, description="Google Drive session token"), - "folder_id": fields.String(description="Google Drive folder ID to fetch files from. If not provided, fetches from root", required=False), - "limit": fields.Integer(description="Maximum number of files to return", default=50) - } - ) - ) - @api.doc(description="Get list of files from Google Drive") - def post(self): - """Get list of files from Google Drive""" - try: - from application.parser.connectors.connector_creator import ConnectorCreator - - data = request.get_json() - session_token = data.get('session_token') - folder_id = data.get('folder_id') - limit = data.get('limit', 50) - - if not session_token: - return make_response( - jsonify({"success": False, "error": "Session token not provided"}), 400 - ) - - # Create Google Drive loader with session token only - loader = ConnectorCreator.create_connector("google_drive", session_token) - - # Get files from Google Drive (limit to first N files, metadata only) - files_config = { - 'limit': limit, - 'list_only': True, - 'session_token': session_token, - 'folder_id': folder_id - } - documents = loader.load_data(files_config) - - # Convert documents to file list format - files = [] - for doc in documents[:limit]: - # Use extra_info instead of doc_metadata - metadata = doc.extra_info - files.append({ - 'id': doc.doc_id, - 'name': metadata.get('file_name', 'Unknown File'), - 'type': metadata.get('mime_type', 'unknown'), - 'size': metadata.get('size', 'Unknown'), - 'modifiedTime': metadata.get('modified_time', 'Unknown'), - 'iconUrl': get_file_icon(metadata.get('mime_type', '')) - }) - - return make_response( - jsonify({ - "success": True, - "files": files, - "total": len(files) - }), - 200 - ) - - except Exception as e: - current_app.logger.error(f"Error loading Google Drive files: {e}") - return make_response( - jsonify({ - "success": False, - "error": f"Failed to load files: {str(e)}. Please make sure your Google Drive account is properly connected and you granted offline access during authorization." 
- }), 500 - ) - -def get_file_icon(mime_type: str) -> str: - """Get appropriate icon for file type""" - if 'pdf' in mime_type: - return '📄' - elif 'word' in mime_type or 'document' in mime_type: - return '📝' - elif 'presentation' in mime_type or 'powerpoint' in mime_type: - return '📊' - elif 'spreadsheet' in mime_type or 'excel' in mime_type: - return '📈' - elif 'text' in mime_type: - return '📄' - elif 'image' in mime_type: - return '🖼️' - else: - return '📄' - -@user_ns.route("/api/google-drive/validate-session") -class GoogleDriveValidateSession(Resource): - @api.expect( - api.model( - "GoogleDriveValidateSessionModel", - { - "session_token": fields.String(required=True, description="Google Drive session token") - } - ) - ) - @api.doc(description="Validate Google Drive session token") - def post(self): - """Validate Google Drive session token and return user info""" - try: - from application.core.mongo_db import MongoDB - from application.parser.connectors.connector_creator import ConnectorCreator - - data = request.get_json() - session_token = data.get('session_token') - - if not session_token: - return make_response( - jsonify({"success": False, "error": "Session token not provided"}), 400 - ) - - # Retrieve session from MongoDB using session token - mongo = MongoDB.get_client() - db = mongo[settings.MONGO_DB_NAME] - sessions_collection = db["drive_sessions"] - - session = sessions_collection.find_one({"session_token": session_token}) - if not session or "token_info" not in session: - return make_response( - jsonify({"success": False, "error": "Invalid or expired session"}), 401 - ) - - # Get token info and check if it's expired - token_info = session["token_info"] - auth = ConnectorCreator.create_auth("google_drive") - - # Check if token is expired using our improved method - is_expired = auth.is_token_expired(token_info) - - # Attempt to refresh token if needed - if is_expired and 'refresh_token' in token_info: - try: - current_app.logger.info("Refreshing expired Google Drive token") - refreshed_token_info = auth.refresh_access_token(token_info['refresh_token']) - - # Update token in database - sessions_collection.update_one( - {"session_token": session_token}, - {"$set": {"token_info": refreshed_token_info}} - ) - - # Use the refreshed token info - token_info = refreshed_token_info - except Exception as e: - current_app.logger.error(f"Error refreshing token: {e}", exc_info=True) - return make_response( - jsonify({"success": False, "error": "Session expired and could not be refreshed"}), 401 - ) - - # Return success with user email - return make_response( - jsonify({ - "success": True, - "user_email": session.get("user_email", "Connected User"), - "message": "Session is valid" - }), - 200 - ) - - except Exception as e: - current_app.logger.error(f"Error validating Google Drive session: {e}", exc_info=True) - return make_response( - jsonify({ - "success": False, - "error": f"Failed to validate session: {str(e)}. Please reconnect your Google Drive account and make sure to grant offline access during authorization." 
- }), 500 - ) diff --git a/application/app.py b/application/app.py index 4159a2bb..489ec840 100644 --- a/application/app.py +++ b/application/app.py @@ -16,6 +16,7 @@ from application.api import api # noqa: E402 from application.api.answer import answer # noqa: E402 from application.api.internal.routes import internal # noqa: E402 from application.api.user.routes import user # noqa: E402 +from application.api.connector.routes import connector # noqa: E402 from application.celery_init import celery # noqa: E402 from application.core.settings import settings # noqa: E402 @@ -30,6 +31,7 @@ app = Flask(__name__) app.register_blueprint(user) app.register_blueprint(answer) app.register_blueprint(internal) +app.register_blueprint(connector) app.config.update( UPLOAD_FOLDER="inputs", CELERY_BROKER_URL=settings.CELERY_BROKER_URL, diff --git a/frontend/public/google-drive-callback.html b/frontend/public/google-drive-callback.html index 0272af9a..d2113624 100644 --- a/frontend/public/google-drive-callback.html +++ b/frontend/public/google-drive-callback.html @@ -70,10 +70,10 @@ } try { - // Exchange code for tokens - // Use the backend API URL directly since this is a static HTML file const backendApiUrl = window.location.protocol + '//' + window.location.hostname + ':7091'; - const response = await fetch(backendApiUrl + '/api/google-drive/callback?' + window.location.search.substring(1)); + const urlParams = new URLSearchParams(window.location.search); + urlParams.set('provider', 'google_drive'); + const response = await fetch(backendApiUrl + '/api/connectors/callback?' + urlParams.toString()); const data = await response.json(); if (data.success) { @@ -81,15 +81,13 @@ if (data.session_token) { localStorage.setItem('google_drive_session_token', data.session_token); } - - // Extract user email let userEmail = data.user_email || 'Connected User'; statusDiv.className = 'success'; statusDiv.innerHTML = `Authentication successful as ${userEmail}!
You can close this window. Your Google Drive is now connected and ready to use.`; - // Notify parent window with session token instead of token_info + if (window.opener) { window.opener.postMessage({ type: 'google_drive_auth_success', @@ -110,7 +108,6 @@ } } - // Run when page loads handleCallback(); From f39ac9945fd7d89d1852970c4ee6b98b1a689e2a Mon Sep 17 00:00:00 2001 From: ManishMadan2882 Date: Thu, 28 Aug 2025 00:53:19 +0530 Subject: [PATCH 13/25] (feat:auth) follow connector-session --- application/parser/connectors/google_drive/auth.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/application/parser/connectors/google_drive/auth.py b/application/parser/connectors/google_drive/auth.py index 01851104..5a903653 100644 --- a/application/parser/connectors/google_drive/auth.py +++ b/application/parser/connectors/google_drive/auth.py @@ -215,9 +215,9 @@ class GoogleDriveAuth(BaseConnectorAuth): mongo = MongoDB.get_client() db = mongo[settings.MONGO_DB_NAME] - sessions_collection = db["drive_sessions"] - - session = sessions_collection.find_one({"session_token": session_token}) + + sessions_collection = db["connector_sessions"] + session = sessions_collection.find_one({"session_token": session_token}) if not session: raise ValueError(f"Invalid session token: {session_token}") From 018273c6b296db0ce4e9461435804051dc0eac89 Mon Sep 17 00:00:00 2001 From: ManishMadan2882 Date: Fri, 29 Aug 2025 01:06:40 +0530 Subject: [PATCH 14/25] (feat:connector) refactor, updated routes FE --- application/api/connector/routes.py | 44 ++----- frontend/src/components/ConnectorAuth.tsx | 112 +++++++++++++++++ frontend/src/upload/Upload.tsx | 146 +++++----------------- 3 files changed, 150 insertions(+), 152 deletions(-) create mode 100644 frontend/src/components/ConnectorAuth.tsx diff --git a/application/api/connector/routes.py b/application/api/connector/routes.py index df4c73f4..bcfa634d 100644 --- a/application/api/connector/routes.py +++ b/application/api/connector/routes.py @@ -1,7 +1,7 @@ import datetime import json -import os -from functools import wraps + + from bson.objectid import ObjectId from flask import ( Blueprint, @@ -13,7 +13,7 @@ from flask import ( from flask_restx import fields, Namespace, Resource -from application.agents.tools.tool_manager import ToolManager + from application.api.user.tasks import ( ingest_connector_task, @@ -21,16 +21,16 @@ from application.api.user.tasks import ( from application.core.mongo_db import MongoDB from application.core.settings import settings from application.api import api -from application.storage.storage_creator import StorageCreator -from application.tts.google_tts import GoogleTTS + + from application.utils import ( check_required_fields ) -from application.utils import num_tokens_from_string -from application.vectorstore.vector_creator import VectorCreator + + from application.parser.connectors.connector_creator import ConnectorCreator -storage = StorageCreator.get_storage() + mongo = MongoDB.get_client() db = mongo[settings.MONGO_DB_NAME] @@ -40,9 +40,6 @@ connector = Blueprint("connector", __name__) connectors_ns = Namespace("connectors", description="Connector operations", path="/") api.add_namespace(connectors_ns) -current_dir = os.path.dirname( - os.path.dirname(os.path.dirname(os.path.abspath(__file__))) -) @connectors_ns.route("/api/connectors/upload") @@ -438,8 +435,7 @@ class ConnectorFiles(Resource): 'name': metadata.get('file_name', 'Unknown File'), 'type': metadata.get('mime_type', 'unknown'), 'size': 
metadata.get('size', 'Unknown'), - 'modifiedTime': metadata.get('modified_time', 'Unknown'), - 'iconUrl': get_file_icon(metadata.get('mime_type', '')) + 'modifiedTime': metadata.get('modified_time', 'Unknown') }) return make_response(jsonify({"success": True, "files": files, "total": len(files)}), 200) @@ -511,25 +507,3 @@ class ConnectorDisconnect(Resource): return make_response(jsonify({"success": False, "error": str(e)}), 500) -def get_file_icon(mime_type): - """Return appropriate icon URL based on file MIME type""" - icon_map = { - 'application/vnd.google-apps.document': '/icons/google-docs.png', - 'application/vnd.google-apps.spreadsheet': '/icons/google-sheets.png', - 'application/vnd.google-apps.presentation': '/icons/google-slides.png', - 'application/pdf': '/icons/pdf.png', - 'text/plain': '/icons/text.png', - 'application/msword': '/icons/word.png', - 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '/icons/word.png', - 'application/vnd.ms-excel': '/icons/excel.png', - 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': '/icons/excel.png', - 'application/vnd.ms-powerpoint': '/icons/powerpoint.png', - 'application/vnd.openxmlformats-officedocument.presentationml.presentation': '/icons/powerpoint.png', - 'image/jpeg': '/icons/image.png', - 'image/png': '/icons/image.png', - 'image/gif': '/icons/image.png', - 'video/mp4': '/icons/video.png', - 'application/zip': '/icons/archive.png', - 'application/x-zip-compressed': '/icons/archive.png', - } - return icon_map.get(mime_type, '/icons/generic-file.png') diff --git a/frontend/src/components/ConnectorAuth.tsx b/frontend/src/components/ConnectorAuth.tsx new file mode 100644 index 00000000..22566521 --- /dev/null +++ b/frontend/src/components/ConnectorAuth.tsx @@ -0,0 +1,112 @@ +import React, { useRef } from 'react'; +import { useSelector } from 'react-redux'; +import { selectToken } from '../preferences/preferenceSlice'; + +interface ConnectorAuthProps { + provider: string; + onSuccess: (data: { session_token: string; user_email: string }) => void; + onError: (error: string) => void; + label?: string; +} + +const providerLabel = (provider: string) => { + const map: Record = { + google_drive: 'Google Drive', + }; + return map[provider] || provider.replace(/_/g, ' '); +}; + +const ConnectorAuth: React.FC = ({ provider, onSuccess, onError, label }) => { + const token = useSelector(selectToken); + const completedRef = useRef(false); + const intervalRef = useRef(null); + + const cleanup = () => { + if (intervalRef.current) { + clearInterval(intervalRef.current); + intervalRef.current = null; + } + window.removeEventListener('message', handleAuthMessage as any); + }; + + const handleAuthMessage = (event: MessageEvent) => { + const successGeneric = event.data?.type === 'connector_auth_success'; + const successProvider = event.data?.type === `${provider}_auth_success` || event.data?.type === 'google_drive_auth_success'; + const errorProvider = event.data?.type === `${provider}_auth_error` || event.data?.type === 'google_drive_auth_error'; + + if (successGeneric || successProvider) { + completedRef.current = true; + cleanup(); + onSuccess({ + session_token: event.data.session_token, + user_email: event.data.user_email || 'Connected User', + }); + } else if (errorProvider) { + completedRef.current = true; + cleanup(); + onError(event.data.error || 'Authentication failed'); + } + }; + + const handleAuth = async () => { + try { + completedRef.current = false; + cleanup(); + + const apiHost = 
import.meta.env.VITE_API_HOST; + const authResponse = await fetch(`${apiHost}/api/connectors/auth?provider=${provider}`, { + headers: { Authorization: `Bearer ${token}` }, + }); + + if (!authResponse.ok) { + throw new Error(`Failed to get authorization URL: ${authResponse.status}`); + } + + const authData = await authResponse.json(); + if (!authData.success || !authData.authorization_url) { + throw new Error(authData.error || 'Failed to get authorization URL'); + } + + const authWindow = window.open( + authData.authorization_url, + `${provider}-auth`, + 'width=500,height=600,scrollbars=yes,resizable=yes' + ); + if (!authWindow) { + throw new Error('Failed to open authentication window. Please allow popups.'); + } + + window.addEventListener('message', handleAuthMessage as any); + + const checkClosed = window.setInterval(() => { + if (authWindow.closed) { + clearInterval(checkClosed); + window.removeEventListener('message', handleAuthMessage as any); + if (!completedRef.current) { + onError('Authentication was cancelled'); + } + } + }, 1000); + intervalRef.current = checkClosed; + } catch (error) { + onError(error instanceof Error ? error.message : 'Authentication failed'); + } + }; + + const buttonLabel = label || `Connect ${providerLabel(provider)}`; + + return ( + + ); +}; + +export default ConnectorAuth; + diff --git a/frontend/src/upload/Upload.tsx b/frontend/src/upload/Upload.tsx index c2cf87ec..8020a0d5 100644 --- a/frontend/src/upload/Upload.tsx +++ b/frontend/src/upload/Upload.tsx @@ -27,6 +27,7 @@ import { } from './types/ingestor'; import FileIcon from '../assets/file.svg'; import FolderIcon from '../assets/folder.svg'; +import ConnectorAuth from '../components/ConnectorAuth'; function Upload({ receivedFile = [], @@ -329,8 +330,7 @@ function Upload({ data?.find( (d: Doc) => d.type?.toLowerCase() === 'local', ), - ), - ); + )); }); setProgress( (progress) => @@ -514,13 +514,13 @@ function Upload({ try { const apiHost = import.meta.env.VITE_API_HOST; - const validateResponse = await fetch(`${apiHost}/api/google-drive/validate-session`, { + const validateResponse = await fetch(`${apiHost}/api/connectors/validate-session`, { method: 'POST', headers: { 'Content-Type': 'application/json', 'Authorization': `Bearer ${token}` }, - body: JSON.stringify({ session_token: sessionToken }) + body: JSON.stringify({ provider: 'google_drive', session_token: sessionToken }) }); if (!validateResponse.ok) { @@ -547,94 +547,6 @@ function Upload({ } }; - const handleGoogleDriveConnect = async () => { - console.log('Google Drive connect button clicked'); - setIsAuthenticating(true); - setAuthError(''); - - const existingToken = localStorage.getItem('google_drive_session_token'); - if (existingToken) { - fetchUserEmailAndLoadFiles(existingToken); - setIsAuthenticating(false); - return; - } - - try { - const apiHost = import.meta.env.VITE_API_HOST; - - const authResponse = await fetch(`${apiHost}/api/google-drive/auth`, { - headers: { - 'Authorization': `Bearer ${token}` - } - }); - - if (!authResponse.ok) { - throw new Error(`Failed to get authorization URL: ${authResponse.status}`); - } - - const authData = await authResponse.json(); - - if (!authData.success || !authData.authorization_url) { - throw new Error(authData.error || 'Failed to get authorization URL'); - } - - console.log('Opening Google OAuth window...'); - - const authWindow = window.open( - authData.authorization_url, - 'google-drive-auth', - 'width=500,height=600,scrollbars=yes,resizable=yes' - ); - - if (!authWindow) { - throw new 
Error('Failed to open authentication window. Please allow popups.'); - } - - const handleAuthMessage = (event: MessageEvent) => { - console.log('Received message event:', event.data); - - if (event.data.type === 'google_drive_auth_success') { - console.log('OAuth success received:', event.data); - setUserEmail(event.data.user_email || 'Connected User'); - setIsGoogleDriveConnected(true); - setIsAuthenticating(false); - setAuthError(''); - - if (event.data.session_token) { - localStorage.setItem('google_drive_session_token', event.data.session_token); - } - - window.removeEventListener('message', handleAuthMessage); - - loadGoogleDriveFiles(event.data.session_token, null); - } else if (event.data.type === 'google_drive_auth_error') { - console.error('OAuth error received:', event.data); - setAuthError(event.data.error || 'Authentication failed. Please make sure to grant all requested permissions, including offline access. You may need to revoke previous access and re-authorize.'); - setIsAuthenticating(false); - setIsGoogleDriveConnected(false); - window.removeEventListener('message', handleAuthMessage); - } - }; - - window.addEventListener('message', handleAuthMessage); - const checkClosed = setInterval(() => { - if (authWindow.closed) { - clearInterval(checkClosed); - window.removeEventListener('message', handleAuthMessage); - - if (!isGoogleDriveConnected && !isAuthenticating) { - setAuthError('Authentication was cancelled'); - } - } - }, 1000); - - } catch (error) { - console.error('Error during Google Drive authentication:', error); - setAuthError(error instanceof Error ? error.message : 'Authentication failed'); - setIsAuthenticating(false); - } - }; - const loadGoogleDriveFiles = async (sessionToken: string, folderId?: string | null) => { setIsLoadingFiles(true); @@ -648,13 +560,13 @@ function Upload({ requestBody.folder_id = folderId; } - const filesResponse = await fetch(`${apiHost}/api/google-drive/files`, { + const filesResponse = await fetch(`${apiHost}/api/connectors/files`, { method: 'POST', headers: { 'Content-Type': 'application/json', 'Authorization': `Bearer ${token}` }, - body: JSON.stringify(requestBody) + body: JSON.stringify({ ...requestBody, provider: 'google_drive' }) }); if (!filesResponse.ok) { @@ -919,7 +831,7 @@ function Upload({ {files.map((file) => (
{file.name} @@ -973,25 +885,25 @@ function Upload({ )} {!isGoogleDriveConnected ? ( - + { + setUserEmail(data.user_email); + setIsGoogleDriveConnected(true); + setIsAuthenticating(false); + setAuthError(''); + + if (data.session_token) { + localStorage.setItem('google_drive_session_token', data.session_token); + loadGoogleDriveFiles(data.session_token, null); + } + }} + onError={(error) => { + setAuthError(error); + setIsAuthenticating(false); + setIsGoogleDriveConnected(false); + }} + /> ) : (
{/* Connection Status */} @@ -1013,13 +925,13 @@ function Upload({ setAuthError(''); const apiHost = import.meta.env.VITE_API_HOST; - fetch(`${apiHost}/api/google-drive/disconnect`, { + fetch(`${apiHost}/api/connectors/disconnect`, { method: 'POST', headers: { 'Content-Type': 'application/json', 'Authorization': `Bearer ${token}` }, - body: JSON.stringify({ session_token: localStorage.getItem('google_drive_session_token') }) + body: JSON.stringify({ provider: 'google_drive', session_token: localStorage.getItem('google_drive_session_token') }) }).catch(err => console.error('Error disconnecting from Google Drive:', err)); }} className="text-white hover:text-gray-200 text-xs underline" @@ -1111,7 +1023,7 @@ function Upload({ )}
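The routes in this series resolve every provider through `ConnectorCreator`, but the class itself never appears in these patches. Below is a minimal registry sketch, assuming only the shape implied by the four calls the routes make (`get_supported_connectors`, `is_supported`, `create_auth`, `create_connector`); the stub classes and the registry contents are assumptions standing in for the real google_drive auth and loader implementations:

```python
from typing import Dict, Type


class GoogleDriveAuthStub:
    """Placeholder for the google_drive auth class the routes instantiate."""


class GoogleDriveLoaderStub:
    """Placeholder for the google_drive loader; routes construct it with a session token."""

    def __init__(self, session_token: str):
        self.session_token = session_token


class ConnectorCreator:
    # Registry contents are an assumption; google_drive is the only provider
    # visible in this patch series.
    _auths: Dict[str, Type] = {"google_drive": GoogleDriveAuthStub}
    _loaders: Dict[str, Type] = {"google_drive": GoogleDriveLoaderStub}

    @classmethod
    def get_supported_connectors(cls) -> list:
        return list(cls._loaders)

    @classmethod
    def is_supported(cls, provider: str) -> bool:
        return provider in cls._loaders

    @classmethod
    def create_auth(cls, provider: str):
        if not cls.is_supported(provider):
            raise ValueError(f"Unsupported provider: {provider}")
        return cls._auths[provider]()

    @classmethod
    def create_connector(cls, provider: str, session_token: str):
        if not cls.is_supported(provider):
            raise ValueError(f"Unsupported provider: {provider}")
        return cls._loaders[provider](session_token)
```

A dict-based registry keeps the `/api/connectors/*` endpoints provider-agnostic: supporting a new source means registering one auth class and one loader class rather than adding new routes.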
Date: Fri, 29 Aug 2025 02:13:51 +0530 Subject: [PATCH 15/25] (feat:connector,auth) consider user_id --- application/api/connector/routes.py | 34 ++++++++++++++++------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/application/api/connector/routes.py b/application/api/connector/routes.py index bcfa634d..65a5d8c5 100644 --- a/application/api/connector/routes.py +++ b/application/api/connector/routes.py @@ -35,6 +35,7 @@ from application.parser.connectors.connector_creator import ConnectorCreator mongo = MongoDB.get_client() db = mongo[settings.MONGO_DB_NAME] sources_collection = db["sources"] +sessions_collection = db["connector_sessions"] connector = Blueprint("connector", __name__) connectors_ns = Namespace("connectors", description="Connector operations", path="/") @@ -344,10 +345,7 @@ class ConnectorsCallback(Resource): session_token = str(uuid.uuid4()) - from application.core.mongo_db import MongoDB - mongo = MongoDB.get_client() - db = mongo[settings.MONGO_DB_NAME] - sessions_collection = db["connector_sessions"] + sanitized_token_info = { "access_token": token_info.get("access_token"), @@ -357,8 +355,10 @@ class ConnectorsCallback(Resource): "scopes": token_info.get("scopes") } + user_id = request.decoded_token.get("sub") if getattr(request, "decoded_token", None) else None sessions_collection.insert_one({ "session_token": session_token, + "user": user_id, "token_info": sanitized_token_info, "created_at": datetime.datetime.now(datetime.timezone.utc), "user_email": user_email @@ -419,6 +419,15 @@ class ConnectorFiles(Resource): if not provider or not session_token: return make_response(jsonify({"success": False, "error": "provider and session_token are required"}), 400) + + decoded_token = request.decoded_token + if not decoded_token: + return make_response(jsonify({"success": False, "error": "Unauthorized"}), 401) + user = decoded_token.get('sub') + session = sessions_collection.find_one({"session_token": session_token, "user": user}) + if not session: + return make_response(jsonify({"success": False, "error": "Invalid or unauthorized session"}), 401) + loader = ConnectorCreator.create_connector(provider, session_token) documents = loader.load_data({ 'limit': limit, @@ -450,19 +459,19 @@ class ConnectorValidateSession(Resource): @api.doc(description="Validate connector session token and return user info") def post(self): try: - from application.core.mongo_db import MongoDB data = request.get_json() provider = data.get('provider') session_token = data.get('session_token') if not provider or not session_token: return make_response(jsonify({"success": False, "error": "provider and session_token are required"}), 400) - mongo = MongoDB.get_client() - db = mongo[settings.MONGO_DB_NAME] - collection_name = "connector_sessions" - sessions_collection = db[collection_name] - session = sessions_collection.find_one({"session_token": session_token}) + decoded_token = request.decoded_token + if not decoded_token: + return make_response(jsonify({"success": False, "error": "Unauthorized"}), 401) + user = decoded_token.get('sub') + + session = sessions_collection.find_one({"session_token": session_token, "user": user}) if not session or "token_info" not in session: return make_response(jsonify({"success": False, "error": "Invalid or expired session"}), 401) @@ -486,17 +495,12 @@ class ConnectorDisconnect(Resource): @api.doc(description="Disconnect a connector session") def post(self): try: - from application.core.mongo_db import MongoDB data = request.get_json() provider = 
data.get('provider') session_token = data.get('session_token') if not provider: return make_response(jsonify({"success": False, "error": "provider is required"}), 400) - mongo = MongoDB.get_client() - db = mongo[settings.MONGO_DB_NAME] - collection_name = "connector_sessions" - sessions_collection = db[collection_name] if session_token: sessions_collection.delete_one({"session_token": session_token}) From 2868e47cf8cdf92d4a3e783fc3bbcdee144a0d9a Mon Sep 17 00:00:00 2001 From: ManishMadan2882 Date: Fri, 29 Aug 2025 18:05:58 +0530 Subject: [PATCH 16/25] (feat:connector) provider metadata, separate fe nested display --- application/api/user/routes.py | 6 +- application/worker.py | 7 +- frontend/src/components/ConnectorTree.tsx | 526 ++++++++++++++++++++++ 3 files changed, 535 insertions(+), 4 deletions(-) create mode 100644 frontend/src/components/ConnectorTree.tsx diff --git a/application/api/user/routes.py b/application/api/user/routes.py index 0bf6aa2f..15024545 100644 --- a/application/api/user/routes.py +++ b/application/api/user/routes.py @@ -1022,7 +1022,8 @@ class PaginatedSources(Resource): "tokens": doc.get("tokens", ""), "retriever": doc.get("retriever", "classic"), "syncFrequency": doc.get("sync_frequency", ""), - "isNested": bool(doc.get("directory_structure")) + "isNested": bool(doc.get("directory_structure")), + "type": doc.get("type", "file") } paginated_docs.append(doc_data) response = { @@ -1070,7 +1071,8 @@ class CombinedJson(Resource): "tokens": index.get("tokens", ""), "retriever": index.get("retriever", "classic"), "syncFrequency": index.get("sync_frequency", ""), - "is_nested": bool(index.get("directory_structure")) + "is_nested": bool(index.get("directory_structure")), + "type": index.get("type", "file") # Add type field with default "file" } ) except Exception as err: diff --git a/application/worker.py b/application/worker.py index 719ebccc..e231474c 100755 --- a/application/worker.py +++ b/application/worker.py @@ -981,8 +981,11 @@ def ingest_connector( "tokens": tokens, "retriever": retriever, "id": str(id), - "type": source_type, - "remote_data": json.dumps(api_source_config), + "type": "connector", + "remote_data": json.dumps({ + "provider": source_type, + **api_source_config + }), "directory_structure": json.dumps(directory_structure) } diff --git a/frontend/src/components/ConnectorTree.tsx b/frontend/src/components/ConnectorTree.tsx new file mode 100644 index 00000000..cee07aa4 --- /dev/null +++ b/frontend/src/components/ConnectorTree.tsx @@ -0,0 +1,526 @@ +import React, { useState, useRef, useEffect } from 'react'; +import { useTranslation } from 'react-i18next'; +import { useSelector } from 'react-redux'; +import { selectToken } from '../preferences/preferenceSlice'; +import Chunks from './Chunks'; +import ContextMenu, { MenuOption } from './ContextMenu'; +import userService from '../api/services/userService'; +import FileIcon from '../assets/file.svg'; +import FolderIcon from '../assets/folder.svg'; +import ArrowLeft from '../assets/arrow-left.svg'; +import ThreeDots from '../assets/three-dots.svg'; +import EyeView from '../assets/eye-view.svg'; +import { useOutsideAlerter } from '../hooks'; + +interface ConnectorFileNode { + id: string; + name: string; + type: string; + size: string; + modifiedTime: string; + token_count?: number; + mimeType?: string; + isFolder?: boolean; +} + +interface ConnectorDirectoryStructure { + [key: string]: ConnectorFileNode; +} + +interface ConnectorTreeProps { + docId: string; + sourceName: string; + onBackToDocuments: () => 
void; +} + +interface SearchResult { + name: string; + path: string; + isFile: boolean; + id: string; +} + +const ConnectorTree: React.FC = ({ + docId, + sourceName, + onBackToDocuments, +}) => { + const { t } = useTranslation(); + const [directoryStructure, setDirectoryStructure] = useState(null); + const [currentPath, setCurrentPath] = useState([]); + const token = useSelector(selectToken); + const [selectedFile, setSelectedFile] = useState<{ id: string; name: string } | null>(null); + const [activeMenuId, setActiveMenuId] = useState(null); + const menuRefs = useRef<{ [key: string]: React.RefObject }>({}); + const [searchQuery, setSearchQuery] = useState(''); + const [searchResults, setSearchResults] = useState([]); + const searchDropdownRef = useRef(null); + + useOutsideAlerter( + searchDropdownRef, + () => { + setSearchQuery(''); + setSearchResults([]); + }, + [], + false, + ); + + + + useEffect(() => { + const fetchDirectoryStructure = async () => { + try { + const response = await userService.getDirectoryStructure(docId, token); + const data = await response.json(); + + if (data && data.directory_structure) { + const structure: ConnectorDirectoryStructure = {}; + // Convert the directory structure to our format + Object.entries(data.directory_structure).forEach(([key, value]: [string, any]) => { + structure[key] = { + id: key, + name: key, + type: value.type || 'file', + size: value.size_bytes ? `${value.size_bytes} bytes` : '-', + modifiedTime: '-', + token_count: value.token_count, + isFolder: !value.type, + }; + }); + setDirectoryStructure(structure); + + // Update search results when directory structure changes + if (searchQuery && structure) { + setSearchResults(searchFiles(searchQuery, structure)); + } + } else { + // Handle invalid response format + console.log('Invalid response format'); + } + } catch (err) { + console.error('Failed to load directory structure', err); + } + }; + + if (docId) { + fetchDirectoryStructure(); + } + }, [docId, token, searchQuery]); + + const handleFileClick = (fileId: string, fileName: string) => { + setSelectedFile({ id: fileId, name: fileName }); + }; + + const navigateToDirectory = (_folderId: string, folderName: string) => { + setCurrentPath(prev => [...prev, folderName]); + }; + + const navigateUp = () => { + if (currentPath.length > 0) { + setCurrentPath(prev => prev.slice(0, -1)); + } + }; + + const getCurrentDirectory = (): ConnectorDirectoryStructure => { + return directoryStructure || {}; + }; + + const searchFiles = ( + query: string, + structure: ConnectorDirectoryStructure, + currentPath: string[] = [], + ): SearchResult[] => { + let results: SearchResult[] = []; + + Object.entries(structure).forEach(([name, node]) => { + const fullPath = [...currentPath, name].join('/'); + + if (name.toLowerCase().includes(query.toLowerCase())) { + results.push({ + name, + path: fullPath, + isFile: !!node.type, + id: node.id, + }); + } + + if (!node.type) { + // If it's a directory, search recursively + results = [ + ...results, + ...searchFiles(query, node as unknown as ConnectorDirectoryStructure, [ + ...currentPath, + name, + ]), + ]; + } + }); + + return results; + }; + + const handleSearchSelect = (result: SearchResult) => { + if (result.isFile) { + const pathParts = result.path.split('/'); + const fileName = pathParts.pop() || ''; + setCurrentPath(pathParts); + + setSelectedFile({ + id: result.id, + name: fileName, + }); + } else { + setCurrentPath(result.path.split('/')); + setSelectedFile(null); + } + setSearchQuery(''); + 
setSearchResults([]); + }; + + const handleBackNavigation = () => { + if (selectedFile) { + setSelectedFile(null); + } else if (currentPath.length === 0) { + if (onBackToDocuments) { + onBackToDocuments(); + } + } else { + navigateUp(); + } + }; + + const getMenuRef = (itemId: string) => { + if (!menuRefs.current[itemId]) { + menuRefs.current[itemId] = React.createRef(); + } + return menuRefs.current[itemId]; + }; + + const handleMenuClick = (e: React.MouseEvent, itemId: string) => { + e.preventDefault(); + e.stopPropagation(); + + if (activeMenuId === itemId) { + setActiveMenuId(null); + return; + } + setActiveMenuId(itemId); + }; + + const getActionOptions = ( + name: string, + id: string, + isFile: boolean, + _itemId: string, + ): MenuOption[] => { + const options: MenuOption[] = []; + + options.push({ + icon: EyeView, + label: t('settings.sources.view'), + onClick: (event: React.SyntheticEvent) => { + event.stopPropagation(); + if (isFile) { + handleFileClick(id, name); + } else { + navigateToDirectory(id, name); + } + }, + iconWidth: 18, + iconHeight: 18, + variant: 'primary', + }); + + // Remove delete option for connector files since they're not on our servers + // Connector files will be managed through the main Google Drive integration + + return options; + }; + + + + const currentDirectory = getCurrentDirectory(); + + const renderFileSearch = () => { + return ( +

+ { + setSearchQuery(e.target.value); + if (directoryStructure) { + setSearchResults(searchFiles(e.target.value, directoryStructure)); + } + }} + placeholder={t('settings.sources.searchFiles')} + className={`w-full h-[38px] border border-[#D1D9E0] px-4 py-2 dark:border-[#6A6A6A] + ${searchQuery ? 'rounded-t-[24px]' : 'rounded-[24px]'} + bg-transparent focus:outline-none dark:text-[#E0E0E0]`} + /> + + {searchQuery && ( +
+
+ {searchResults.length === 0 ? ( +
+ {t('settings.sources.noResults')} +
+ ) : ( + searchResults.map((result, index) => ( +
handleSearchSelect(result)} + title={result.path} + className={`flex min-w-0 cursor-pointer items-center px-3 py-2 hover:bg-[#ECEEEF] dark:hover:bg-[#27282D] ${index !== searchResults.length - 1 + ? 'border-b border-[#D1D9E0] dark:border-[#6A6A6A]' + : '' + }`} + > + { + + {result.path.split('/').pop() || result.path} + +
+ )) + )} +
+
+ )} +
+ ); + }; + + const renderConnectorFileTree = (structure: ConnectorDirectoryStructure): React.ReactNode[] => { + const entries = Object.entries(structure); + const directories = entries.filter(([_, node]) => node.isFolder); + const files = entries.filter(([_, node]) => !node.isFolder); + + return [ + ...directories.map(([name, node]) => { + const itemId = `dir-${node.id}`; + const menuRef = getMenuRef(itemId); + + return ( + navigateToDirectory(node.id, name)} + > + +
+ {t('settings.sources.folderAlt')} + + {name} + +
+ + + - + + + {node.modifiedTime || '-'} + + +
+ + + setActiveMenuId(isOpen ? itemId : null) + } + options={getActionOptions(name, node.id, false, itemId)} + anchorRef={menuRef} + position="bottom-left" + offset={{ x: -4, y: 4 }} + /> +
+ + + ); + }), + ...files.map(([name, node]) => { + const itemId = `file-${node.id}`; + const menuRef = getMenuRef(itemId); + + return ( + handleFileClick(node.id, name)} + > + +
+ {t('settings.sources.fileAlt')} + + {name} + +
+ + + {node.token_count?.toLocaleString() || '-'} + + + {node.size || '-'} + + +
+ + + setActiveMenuId(isOpen ? itemId : null) + } + options={getActionOptions(name, node.id, true, itemId)} + anchorRef={menuRef} + position="bottom-left" + offset={{ x: -4, y: 4 }} + /> +
+ + + ); + }), + ]; + }; + + const renderPathNavigation = () => { + return ( +
+ {/* Left side with path navigation */} +
+ + +
+ + {sourceName} + + {currentPath.length > 0 && ( + <> + / + {currentPath.map((dir, index) => ( + + + {dir} + + {index < currentPath.length - 1 && ( + + / + + )} + + ))} + + )} + {selectedFile && ( + <> + / + + {selectedFile.name} + + + )} +
+
+ + {/* Right side with search */} +
+ {renderFileSearch()} +
+
+ ); + }; + + return ( +
+ {selectedFile ? ( +
+
+ setSelectedFile(null)} + path={selectedFile.id} + /> +
+
+ ) : ( +
+
{renderPathNavigation()}
+ +
+
+ + + + + + + + + + + {renderConnectorFileTree(currentDirectory)} + +
+ {t('settings.sources.fileName')} + + {t('settings.sources.tokens')} + + {t('settings.sources.size')} + + + {t('settings.sources.actions')} + +
+
+
+
+ )} +
+ ); +}; + +export default ConnectorTree; From bb4ea76d309af7c89ce771e5ce05d1cca020e389 Mon Sep 17 00:00:00 2001 From: ManishMadan2882 Date: Mon, 1 Sep 2025 12:04:58 +0530 Subject: [PATCH 17/25] (fix:connectorTree) path navigation fn --- ...torTree.tsx => ConnectorTreeComponent.tsx} | 677 ++++++++++-------- frontend/src/settings/Sources.tsx | 23 +- 2 files changed, 398 insertions(+), 302 deletions(-) rename frontend/src/components/{ConnectorTree.tsx => ConnectorTreeComponent.tsx} (65%) diff --git a/frontend/src/components/ConnectorTree.tsx b/frontend/src/components/ConnectorTreeComponent.tsx similarity index 65% rename from frontend/src/components/ConnectorTree.tsx rename to frontend/src/components/ConnectorTreeComponent.tsx index cee07aa4..a4258dfc 100644 --- a/frontend/src/components/ConnectorTree.tsx +++ b/frontend/src/components/ConnectorTreeComponent.tsx @@ -10,24 +10,21 @@ import FolderIcon from '../assets/folder.svg'; import ArrowLeft from '../assets/arrow-left.svg'; import ThreeDots from '../assets/three-dots.svg'; import EyeView from '../assets/eye-view.svg'; +import SearchIcon from '../assets/search.svg'; import { useOutsideAlerter } from '../hooks'; -interface ConnectorFileNode { - id: string; - name: string; - type: string; - size: string; - modifiedTime: string; +interface FileNode { + type?: string; token_count?: number; - mimeType?: string; - isFolder?: boolean; + size_bytes?: number; + [key: string]: any; } -interface ConnectorDirectoryStructure { - [key: string]: ConnectorFileNode; +interface DirectoryStructure { + [key: string]: FileNode; } -interface ConnectorTreeProps { +interface ConnectorTreeComponentProps { docId: string; sourceName: string; onBackToDocuments: () => void; @@ -37,21 +34,28 @@ interface SearchResult { name: string; path: string; isFile: boolean; - id: string; } -const ConnectorTree: React.FC = ({ +const ConnectorTreeComponent: React.FC = ({ docId, sourceName, onBackToDocuments, }) => { const { t } = useTranslation(); - const [directoryStructure, setDirectoryStructure] = useState(null); + const [loading, setLoading] = useState(true); + const [error, setError] = useState(null); + const [directoryStructure, setDirectoryStructure] = + useState(null); const [currentPath, setCurrentPath] = useState([]); const token = useSelector(selectToken); - const [selectedFile, setSelectedFile] = useState<{ id: string; name: string } | null>(null); const [activeMenuId, setActiveMenuId] = useState(null); - const menuRefs = useRef<{ [key: string]: React.RefObject }>({}); + const menuRefs = useRef<{ + [key: string]: React.RefObject; + }>({}); + const [selectedFile, setSelectedFile] = useState<{ + id: string; + name: string; + } | null>(null); const [searchQuery, setSearchQuery] = useState(''); const [searchResults, setSearchResults] = useState([]); const searchDropdownRef = useRef(null); @@ -66,69 +70,369 @@ const ConnectorTree: React.FC = ({ false, ); - + const handleFileClick = (fileName: string) => { + const fullPath = [...currentPath, fileName].join('/'); + setSelectedFile({ + id: fullPath, + name: fileName, + }); + }; useEffect(() => { const fetchDirectoryStructure = async () => { try { + setLoading(true); const response = await userService.getDirectoryStructure(docId, token); const data = await response.json(); if (data && data.directory_structure) { - const structure: ConnectorDirectoryStructure = {}; - // Convert the directory structure to our format - Object.entries(data.directory_structure).forEach(([key, value]: [string, any]) => { - structure[key] = { - id: key, 
- name: key, - type: value.type || 'file', - size: value.size_bytes ? `${value.size_bytes} bytes` : '-', - modifiedTime: '-', - token_count: value.token_count, - isFolder: !value.type, - }; - }); - setDirectoryStructure(structure); - - // Update search results when directory structure changes - if (searchQuery && structure) { - setSearchResults(searchFiles(searchQuery, structure)); - } + setDirectoryStructure(data.directory_structure); } else { - // Handle invalid response format - console.log('Invalid response format'); + setError('Invalid response format'); } } catch (err) { - console.error('Failed to load directory structure', err); + setError('Failed to load directory structure'); + console.error(err); + } finally { + setLoading(false); } }; if (docId) { fetchDirectoryStructure(); } - }, [docId, token, searchQuery]); + }, [docId, token]); - const handleFileClick = (fileId: string, fileName: string) => { - setSelectedFile({ id: fileId, name: fileName }); - }; - - const navigateToDirectory = (_folderId: string, folderName: string) => { - setCurrentPath(prev => [...prev, folderName]); + const navigateToDirectory = (dirName: string) => { + setCurrentPath([...currentPath, dirName]); }; const navigateUp = () => { - if (currentPath.length > 0) { - setCurrentPath(prev => prev.slice(0, -1)); + setCurrentPath(currentPath.slice(0, -1)); + }; + + const getCurrentDirectory = (): DirectoryStructure => { + if (!directoryStructure) return {}; + + let current = directoryStructure; + for (const dir of currentPath) { + if (current[dir] && !current[dir].type) { + current = current[dir] as DirectoryStructure; + } else { + return {}; + } + } + return current; + }; + + const formatBytes = (bytes: number): string => { + if (bytes === 0) return '0 Bytes'; + const k = 1024; + const sizes = ['Bytes', 'KB', 'MB', 'GB', 'TB']; + const i = Math.floor(Math.log(bytes) / Math.log(k)); + return parseFloat((bytes / Math.pow(k, i)).toFixed(2)) + ' ' + sizes[i]; + }; + + const getMenuRef = (id: string) => { + if (!menuRefs.current[id]) { + menuRefs.current[id] = React.createRef(); + } + return menuRefs.current[id]; + }; + + const handleMenuClick = ( + e: React.MouseEvent, + id: string, + ) => { + e.stopPropagation(); + setActiveMenuId(activeMenuId === id ? 
null : id); + }; + + const getActionOptions = ( + name: string, + isFile: boolean, + _itemId: string, + ): MenuOption[] => { + const options: MenuOption[] = []; + + options.push({ + icon: EyeView, + label: t('settings.sources.view'), + onClick: (event: React.SyntheticEvent) => { + event.stopPropagation(); + if (isFile) { + handleFileClick(name); + } else { + navigateToDirectory(name); + } + }, + iconWidth: 18, + iconHeight: 18, + variant: 'primary', + }); + + // No delete option for connector files + + return options; + }; + + const calculateDirectoryStats = ( + structure: DirectoryStructure, + ): { totalSize: number; totalTokens: number } => { + let totalSize = 0; + let totalTokens = 0; + + Object.entries(structure).forEach(([_, node]) => { + if (node.type) { + // It's a file + totalSize += node.size_bytes || 0; + totalTokens += node.token_count || 0; + } else { + // It's a directory, recurse + const stats = calculateDirectoryStats(node); + totalSize += stats.totalSize; + totalTokens += stats.totalTokens; + } + }); + + return { totalSize, totalTokens }; + }; + + const handleBackNavigation = () => { + if (selectedFile) { + setSelectedFile(null); + } else if (currentPath.length === 0) { + if (onBackToDocuments) { + onBackToDocuments(); + } + } else { + navigateUp(); } }; - const getCurrentDirectory = (): ConnectorDirectoryStructure => { - return directoryStructure || {}; + const renderPathNavigation = () => { + return ( +
+ {/* Left side with path navigation */} +
+ + +
+ + {sourceName} + + {currentPath.length > 0 && ( + <> + / + {currentPath.map((dir, index) => ( + + + {index < currentPath.length - 1 && ( + / + )} + + ))} + + )} +
+
+ +
+ {renderFileSearch()} +
+
+ ); + }; + + const renderFileTree = (directory: DirectoryStructure) => { + if (!directory) return []; + + // Create parent directory row + const parentRow = + currentPath.length > 0 + ? [ + + +
+ {t('settings.sources.parentFolderAlt')} + + .. + +
+ + + - + + + - + + + , + ] + : []; + + // Sort entries: directories first, then files, both alphabetically + const sortedEntries = Object.entries(directory).sort(([nameA, nodeA], [nameB, nodeB]) => { + const isFileA = !!nodeA.type; + const isFileB = !!nodeB.type; + + if (isFileA !== isFileB) { + return isFileA ? 1 : -1; // Directories first + } + + return nameA.localeCompare(nameB); // Alphabetical within each group + }); + + + // Process directories + const directoryRows = sortedEntries + .filter(([_, node]) => !node.type) + .map(([name, node]) => { + const itemId = `dir-${name}`; + const menuRef = getMenuRef(itemId); + + // Calculate directory stats + const dirStats = calculateDirectoryStats(node as DirectoryStructure); + + return ( + navigateToDirectory(name)} + > + +
+ {t('settings.sources.folderAlt')} + + {name} + +
+ + + {dirStats.totalTokens > 0 + ? dirStats.totalTokens.toLocaleString() + : '-'} + + + {dirStats.totalSize > 0 ? formatBytes(dirStats.totalSize) : '-'} + + +
+ + + setActiveMenuId(isOpen ? itemId : null) + } + options={getActionOptions(name, false, itemId)} + anchorRef={menuRef} + position="bottom-left" + offset={{ x: -4, y: 4 }} + /> +
+ + + ); + }); + + // Process files + const fileRows = sortedEntries + .filter(([_, node]) => !!node.type) + .map(([name, node]) => { + const itemId = `file-${name}`; + const menuRef = getMenuRef(itemId); + + return ( + handleFileClick(name)} + > + +
+ {t('settings.sources.fileAlt')} + + {name} + +
+ + + {node.token_count?.toLocaleString() || '-'} + + + {node.size_bytes ? formatBytes(node.size_bytes) : '-'} + + +
+ + + setActiveMenuId(isOpen ? itemId : null) + } + options={getActionOptions(name, true, itemId)} + anchorRef={menuRef} + position="bottom-left" + offset={{ x: -4, y: 4 }} + /> +
+ + + ); + }); + + return [...parentRow, ...directoryRows, ...fileRows]; }; const searchFiles = ( query: string, - structure: ConnectorDirectoryStructure, + structure: DirectoryStructure, currentPath: string[] = [], ): SearchResult[] => { let results: SearchResult[] = []; @@ -141,7 +445,6 @@ const ConnectorTree: React.FC = ({ name, path: fullPath, isFile: !!node.type, - id: node.id, }); } @@ -149,7 +452,7 @@ const ConnectorTree: React.FC = ({ // If it's a directory, search recursively results = [ ...results, - ...searchFiles(query, node as unknown as ConnectorDirectoryStructure, [ + ...searchFiles(query, node as DirectoryStructure, [ ...currentPath, name, ]), @@ -167,7 +470,7 @@ const ConnectorTree: React.FC = ({ setCurrentPath(pathParts); setSelectedFile({ - id: result.id, + id: result.path, name: fileName, }); } else { @@ -178,70 +481,6 @@ const ConnectorTree: React.FC = ({ setSearchResults([]); }; - const handleBackNavigation = () => { - if (selectedFile) { - setSelectedFile(null); - } else if (currentPath.length === 0) { - if (onBackToDocuments) { - onBackToDocuments(); - } - } else { - navigateUp(); - } - }; - - const getMenuRef = (itemId: string) => { - if (!menuRefs.current[itemId]) { - menuRefs.current[itemId] = React.createRef(); - } - return menuRefs.current[itemId]; - }; - - const handleMenuClick = (e: React.MouseEvent, itemId: string) => { - e.preventDefault(); - e.stopPropagation(); - - if (activeMenuId === itemId) { - setActiveMenuId(null); - return; - } - setActiveMenuId(itemId); - }; - - const getActionOptions = ( - name: string, - id: string, - isFile: boolean, - _itemId: string, - ): MenuOption[] => { - const options: MenuOption[] = []; - - options.push({ - icon: EyeView, - label: t('settings.sources.view'), - onClick: (event: React.SyntheticEvent) => { - event.stopPropagation(); - if (isFile) { - handleFileClick(id, name); - } else { - navigateToDirectory(id, name); - } - }, - iconWidth: 18, - iconHeight: 18, - variant: 'primary', - }); - - // Remove delete option for connector files since they're not on our servers - // Connector files will be managed through the main Google Drive integration - - return options; - }; - - - - const currentDirectory = getCurrentDirectory(); - const renderFileSearch = () => { return (
@@ -255,8 +494,8 @@ const ConnectorTree: React.FC = ({ } }} placeholder={t('settings.sources.searchFiles')} - className={`w-full h-[38px] border border-[#D1D9E0] px-4 py-2 dark:border-[#6A6A6A] - ${searchQuery ? 'rounded-t-[24px]' : 'rounded-[24px]'} + className={`w-full h-[38px] border border-[#D1D9E0] px-4 py-2 dark:border-[#6A6A6A] + ${searchQuery ? 'rounded-t-[24px]' : 'rounded-[24px]'} bg-transparent focus:outline-none dark:text-[#E0E0E0]`} /> @@ -300,177 +539,27 @@ const ConnectorTree: React.FC = ({ ); }; - const renderConnectorFileTree = (structure: ConnectorDirectoryStructure): React.ReactNode[] => { - const entries = Object.entries(structure); - const directories = entries.filter(([_, node]) => node.isFolder); - const files = entries.filter(([_, node]) => !node.isFolder); - - return [ - ...directories.map(([name, node]) => { - const itemId = `dir-${node.id}`; - const menuRef = getMenuRef(itemId); - - return ( - navigateToDirectory(node.id, name)} - > - -
- {t('settings.sources.folderAlt')} - - {name} - -
- - - - - - - {node.modifiedTime || '-'} - - -
- - - setActiveMenuId(isOpen ? itemId : null) - } - options={getActionOptions(name, node.id, false, itemId)} - anchorRef={menuRef} - position="bottom-left" - offset={{ x: -4, y: 4 }} - /> -
- - - ); - }), - ...files.map(([name, node]) => { - const itemId = `file-${node.id}`; - const menuRef = getMenuRef(itemId); - - return ( - handleFileClick(node.id, name)} - > - -
- {t('settings.sources.fileAlt')} - - {name} - -
- - - {node.token_count?.toLocaleString() || '-'} - - - {node.size || '-'} - - -
- - - setActiveMenuId(isOpen ? itemId : null) - } - options={getActionOptions(name, node.id, true, itemId)} - anchorRef={menuRef} - position="bottom-left" - offset={{ x: -4, y: 4 }} - /> -
- - - ); - }), - ]; + const handleFileSearch = (searchQuery: string) => { + if (directoryStructure) { + return searchFiles(searchQuery, directoryStructure); + } + return []; }; - const renderPathNavigation = () => { - return ( -
- {/* Left side with path navigation */} -
- + const handleFileSelect = (path: string) => { + const pathParts = path.split('/'); + const fileName = pathParts.pop() || ''; + setCurrentPath(pathParts); + setSelectedFile({ + id: path, + name: fileName, + }); + }; -
- - {sourceName} - - {currentPath.length > 0 && ( - <> - / - {currentPath.map((dir, index) => ( - - - {dir} - - {index < currentPath.length - 1 && ( - - / - - )} - - ))} - - )} - {selectedFile && ( - <> - / - - {selectedFile.name} - - - )} -
-
+ const currentDirectory = getCurrentDirectory(); - {/* Right side with search */} -
- {renderFileSearch()} -
-
- ); + const navigateToPath = (index: number) => { + setCurrentPath(currentPath.slice(0, index + 1)); }; return ( @@ -483,6 +572,8 @@ const ConnectorTree: React.FC = ({ documentName={sourceName} handleGoBack={() => setSelectedFile(null)} path={selectedFile.id} + onFileSearch={handleFileSearch} + onFileSelect={handleFileSelect} />
@@ -504,15 +595,11 @@ const ConnectorTree: React.FC = ({ {t('settings.sources.size')} - - - {t('settings.sources.actions')} - - + - - {renderConnectorFileTree(currentDirectory)} + + {renderFileTree(getCurrentDirectory())}
@@ -523,4 +610,4 @@ const ConnectorTree: React.FC = ({ ); }; -export default ConnectorTree; +export default ConnectorTreeComponent; diff --git a/frontend/src/settings/Sources.tsx b/frontend/src/settings/Sources.tsx index e0473bb8..945bd16c 100644 --- a/frontend/src/settings/Sources.tsx +++ b/frontend/src/settings/Sources.tsx @@ -29,6 +29,7 @@ import { import Upload from '../upload/Upload'; import { formatDate } from '../utils/dateTimeUtils'; import FileTreeComponent from '../components/FileTreeComponent'; +import ConnectorTreeComponent from '../components/ConnectorTreeComponent'; import Chunks from '../components/Chunks'; const formatTokens = (tokens: number): string => { @@ -271,19 +272,27 @@ export default function Sources({ return documentToView ? (
- {documentToView.isNested ? ( - setDocumentToView(undefined)} /> ) : ( - setDocumentToView(undefined)} + setDocumentToView(undefined)} /> - )} + ) + ) : ( + setDocumentToView(undefined)} + /> + )}
) : (
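ConnectorTreeComponent above rolls folder sizes and token counts up on the client from the nested directory_structure (calculateDirectoryStats and formatBytes): file nodes carry a type plus optional token_count and size_bytes, while folder nodes are plain nested objects. A minimal sketch of the same aggregation done server-side, assuming only that node shape — the Python helper names are illustrative, not part of the patch:

    import math

    def directory_stats(structure: dict) -> tuple[int, int]:
        """Return (total_size_bytes, total_tokens) for a directory_structure subtree."""
        total_size = total_tokens = 0
        for node in structure.values():
            if not isinstance(node, dict):
                continue
            if node.get("type"):  # file node: carries an explicit type
                total_size += node.get("size_bytes") or 0
                total_tokens += node.get("token_count") or 0
            else:  # folder node: recurse into the nested mapping
                size, tokens = directory_stats(node)
                total_size += size
                total_tokens += tokens
        return total_size, total_tokens

    def format_bytes(num_bytes: int) -> str:
        """Human-readable size with two decimals, mirroring formatBytes."""
        if num_bytes <= 0:
            return "0 Bytes"
        units = ["Bytes", "KB", "MB", "GB", "TB"]
        i = min(int(math.log(num_bytes, 1024)), len(units) - 1)
        return f"{num_bytes / 1024 ** i:.2f} {units[i]}"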
From 384ad3e0ac245fc593cb4b61b0bb6b5d97ef037b Mon Sep 17 00:00:00 2001 From: ManishMadan2882 Date: Tue, 2 Sep 2025 13:34:31 +0530 Subject: [PATCH 18/25] (feat:connector) raw sync flow --- application/api/connector/routes.py | 115 +++++++++++++++++- application/api/user/tasks.py | 32 +++-- .../parser/connectors/google_drive/loader.py | 110 +++++++++++++++++ application/worker.py | 113 ++++++++++++++--- 4 files changed, 336 insertions(+), 34 deletions(-) diff --git a/application/api/connector/routes.py b/application/api/connector/routes.py index 65a5d8c5..014fc71f 100644 --- a/application/api/connector/routes.py +++ b/application/api/connector/routes.py @@ -74,6 +74,7 @@ class UploadConnector(Resource): try: config = json.loads(data["data"]) source_data = None + sync_frequency = config.get("sync_frequency", "never") if data["source"] == "github": source_data = config.get("repo_url") @@ -112,7 +113,8 @@ class UploadConnector(Resource): file_ids=file_ids, folder_ids=folder_ids, recursive=config.get("recursive", False), - retriever=config.get("retriever", "classic") + retriever=config.get("retriever", "classic"), + sync_frequency=sync_frequency ) return make_response(jsonify({"success": True, "task_id": task.id}), 200) task = ingest_connector_task.delay( @@ -120,6 +122,7 @@ class UploadConnector(Resource): job_name=data["name"], user=decoded_token.get("sub"), loader=data["source"], + sync_frequency=sync_frequency ) except Exception as err: current_app.logger.error( @@ -511,3 +514,113 @@ class ConnectorDisconnect(Resource): return make_response(jsonify({"success": False, "error": str(e)}), 500) +@connectors_ns.route("/api/connectors/sync") +class ConnectorSync(Resource): + @api.expect( + api.model( + "ConnectorSyncModel", + { + "source_id": fields.String(required=True, description="Source ID to sync"), + "session_token": fields.String(required=True, description="Authentication token") + }, + ) + ) + @api.doc(description="Sync connector source to check for modifications") + def post(self): + decoded_token = request.decoded_token + if not decoded_token: + return make_response(jsonify({"success": False}), 401) + + try: + data = request.get_json() + source_id = data.get('source_id') + session_token = data.get('session_token') + + if not all([source_id, session_token]): + return make_response( + jsonify({ + "success": False, + "error": "source_id and session_token are required" + }), + 400 + ) + source = sources_collection.find_one({"_id": ObjectId(source_id)}) + if not source: + return make_response( + jsonify({ + "success": False, + "error": "Source not found" + }), + 404 + ) + + if source.get('user') != decoded_token.get('sub'): + return make_response( + jsonify({ + "success": False, + "error": "Unauthorized access to source" + }), + 403 + ) + + remote_data = {} + try: + if source.get('remote_data'): + remote_data = json.loads(source.get('remote_data')) + except json.JSONDecodeError: + current_app.logger.error(f"Invalid remote_data format for source {source_id}") + remote_data = {} + + source_type = remote_data.get('provider') + if not source_type: + return make_response( + jsonify({ + "success": False, + "error": "Source provider not found in remote_data" + }), + 400 + ) + + # Extract configuration from remote_data + file_ids = remote_data.get('file_ids', []) + folder_ids = remote_data.get('folder_ids', []) + recursive = remote_data.get('recursive', True) + + # Start the sync task + task = ingest_connector_task.delay( + job_name=source.get('name'), + user=decoded_token.get('sub'), + 
                source_type=source_type,
+                session_token=session_token,
+                file_ids=file_ids,
+                folder_ids=folder_ids,
+                recursive=recursive,
+                retriever=source.get('retriever', 'classic'),
+                operation_mode="sync",
+                doc_id=source_id,
+                sync_frequency=source.get('sync_frequency', 'never')
+            )
+
+            return make_response(
+                jsonify({
+                    "success": True,
+                    "task_id": task.id
+                }),
+                200
+            )
+
+        except Exception as err:
+            current_app.logger.error(
+                f"Error syncing connector source: {err}",
+                exc_info=True
+            )
+            return make_response(
+                jsonify({
+                    "success": False,
+                    "error": str(err)
+                }),
+                400
+            )
+
+
+
diff --git a/application/api/user/tasks.py b/application/api/user/tasks.py
index 833edbff..3519b701 100644
--- a/application/api/user/tasks.py
+++ b/application/api/user/tasks.py
@@ -49,27 +49,33 @@ def process_agent_webhook(self, agent_id, payload):
 
 @celery.task(bind=True)
 def ingest_connector_task(
-    self,
-    job_name,
-    user,
-    source_type,
-    session_token=None,
-    file_ids=None,
-    folder_ids=None,
+    self,
+    job_name,
+    user,
+    source_type,
+    session_token=None,
+    file_ids=None,
+    folder_ids=None,
     recursive=True,
-    retriever="classic"
+    retriever="classic",
+    operation_mode="upload",
+    doc_id=None,
+    sync_frequency="never"
 ):
     from application.worker import ingest_connector
 
     resp = ingest_connector(
-        self,
-        job_name,
-        user,
-        source_type,
+        self,
+        job_name,
+        user,
+        source_type,
         session_token=session_token,
         file_ids=file_ids,
         folder_ids=folder_ids,
         recursive=recursive,
-        retriever=retriever
+        retriever=retriever,
+        operation_mode=operation_mode,
+        doc_id=doc_id,
+        sync_frequency=sync_frequency
     )
     return resp
diff --git a/application/parser/connectors/google_drive/loader.py b/application/parser/connectors/google_drive/loader.py
index a81ad4d4..06737748 100644
--- a/application/parser/connectors/google_drive/loader.py
+++ b/application/parser/connectors/google_drive/loader.py
@@ -146,6 +146,116 @@ class GoogleDriveLoader(BaseConnectorLoader):
             logging.error(f"Error loading data from Google Drive: {e}", exc_info=True)
             raise
 
+    def scan_drive_contents(self, file_ids: List[str], folder_ids: List[str],
+                            modified_after: str = "2024-01-01T00:00:00Z") -> Dict[str, Any]:
+        """
+        Scan Google Drive contents and check for files/folders modified after a specific date.
+ + Args: + file_ids: List of specific file IDs to check + folder_ids: List of folder IDs to scan for modified contents + modified_after: ISO 8601 formatted date string (default: "2024-01-01T00:00:00Z") + + Returns: + Dictionary containing: + - 'modified_files': List of file IDs that were modified after the given date + - 'modified_folders': List of folder IDs that were modified after the given date + - 'scan_summary': Summary of the scan results + """ + self._ensure_service() + + modified_files = [] + modified_folders = [] + + try: + for file_id in file_ids: + try: + file_metadata = self.service.files().get( + fileId=file_id, + fields='id,name,modifiedTime,mimeType' + ).execute() + + modified_time = file_metadata.get('modifiedTime', '') + if modified_time > modified_after: + modified_files.append({ + 'id': file_id, + 'name': file_metadata.get('name', 'Unknown'), + 'modifiedTime': modified_time, + 'mimeType': file_metadata.get('mimeType', '') + }) + + except Exception as e: + logging.warning(f"Error checking file {file_id}: {e}") + continue + + for folder_id in folder_ids: + try: + folder_metadata = self.service.files().get( + fileId=folder_id, + fields='id,name,modifiedTime,mimeType' + ).execute() + + folder_modified_time = folder_metadata.get('modifiedTime', '') + if folder_modified_time > modified_after: + modified_folders.append({ + 'id': folder_id, + 'name': folder_metadata.get('name', 'Unknown'), + 'modifiedTime': folder_modified_time, + 'mimeType': folder_metadata.get('mimeType', '') + }) + + query = f"'{folder_id}' in parents and modifiedTime > '{modified_after}'" + + page_token = None + while True: + results = self.service.files().list( + q=query, + spaces='drive', + fields='nextPageToken, files(id, name, modifiedTime, mimeType)', + pageToken=page_token + ).execute() + + items = results.get('files', []) + + for item in items: + item_info = { + 'id': item['id'], + 'name': item['name'], + 'modifiedTime': item['modifiedTime'], + 'mimeType': item['mimeType'] + } + + if item['mimeType'] == 'application/vnd.google-apps.folder': + modified_folders.append(item_info) + else: + modified_files.append(item_info) + + page_token = results.get('nextPageToken') + if not page_token: + break + + except Exception as e: + logging.warning(f"Error scanning folder {folder_id}: {e}") + continue + + summary = { + 'total_modified_files': len(modified_files), + 'total_modified_folders': len(modified_folders), + 'scan_date': modified_after + } + + logging.info(f"Drive scan completed: {summary['total_modified_files']} files and {summary['total_modified_folders']} folders modified after {modified_after}") + + return { + 'modified_files': modified_files, + 'modified_folders': modified_folders, + 'scan_summary': summary + } + + except Exception as e: + logging.error(f"Error scanning drive contents: {e}", exc_info=True) + raise + def _load_file_by_id(self, file_id: str, load_content: bool = True) -> Optional[Document]: self._ensure_service() diff --git a/application/worker.py b/application/worker.py index e231474c..75519df6 100755 --- a/application/worker.py +++ b/application/worker.py @@ -650,8 +650,11 @@ def remote_worker( "id": str(id), "type": loader, "remote_data": source_data, - "sync_frequency": sync_frequency, + "sync_frequency": sync_frequency } + + if operation_mode == "sync": + file_data["last_sync"] = datetime.datetime.now() upload_index(full_path, file_data) except Exception as e: logging.error("Error in remote_worker task: %s", str(e), exc_info=True) @@ -708,7 +711,7 @@ def sync_worker(self, 
frequency): self, source_data, name, user, source_type, frequency, retriever, doc_id ) sync_counts["total_sync_count"] += 1 - sync_counts[ + sync_counts[ "sync_success" if resp["status"] == "success" else "sync_failure" ] += 1 return { @@ -745,7 +748,7 @@ def attachment_worker(self, file_info, user): input_files=[local_path], exclude_hidden=True, errors="ignore" ) .load_data()[0] - .text, + .text, ) @@ -839,15 +842,18 @@ def agent_webhook_worker(self, agent_id, payload): def ingest_connector( - self, - job_name: str, - user: str, + self, + job_name: str, + user: str, source_type: str, session_token=None, - file_ids=None, - folder_ids=None, + file_ids=None, + folder_ids=None, recursive=True, - retriever: str = "classic" + retriever: str = "classic", + operation_mode: str = "upload", + doc_id=None, + sync_frequency: str = "never", ) -> Dict[str, Any]: """ Ingestion for internal knowledge bases (GoogleDrive, etc.). @@ -861,6 +867,9 @@ def ingest_connector( folder_ids: List of folder IDs to download recursive: Whether to recursively download folders retriever: Type of retriever to use + operation_mode: "upload" for initial ingestion, "sync" for incremental sync + doc_id: Document ID for sync operations (required when operation_mode="sync") + sync_frequency: How often to sync ("never", "daily", "weekly", "monthly") """ logging.info(f"Starting remote ingestion from {source_type} for user: {user}, job: {job_name}") self.update_state(state="PROGRESS", meta={"current": 1}) @@ -869,25 +878,73 @@ def ingest_connector( try: # Step 1: Initialize the appropriate loader self.update_state(state="PROGRESS", meta={"current": 10, "status": "Initializing connector"}) - + + # Handle incremental sync using Google Drive API directly + current_sync_time = datetime.datetime.now().isoformat() + 'Z' + + if operation_mode == "sync": + if source_type == "google_drive": + from application.parser.connectors.connector_creator import ConnectorCreator + remote_loader = ConnectorCreator.create_connector("google_drive", session_token) + + source = sources_collection.find_one({"_id": ObjectId(doc_id)}) + + last_sync_time = source.get("last_sync") + if not last_sync_time: + last_sync_time = source.get("date") + + + scan_results = remote_loader.scan_drive_contents( + file_ids or [], + folder_ids or [], + modified_after=last_sync_time + ) + + modified_files = scan_results.get('modified_files', []) + modified_folders = scan_results.get('modified_folders', []) + + # Log atomic changes detected via Google Drive API + if modified_files: + logging.info(f"Files modified since last sync: {len(modified_files)} files") + for f in modified_files: + logging.info(f" - {f['name']} (ID: {f['id']}, Modified: {f['modifiedTime']})") + + if modified_folders: + logging.info(f"Folders modified since last sync: {len(modified_folders)} folders") + for f in modified_folders: + logging.info(f" - {f['name']} (ID: {f['id']}, Modified: {f['modifiedTime']})") + + if not modified_files and not modified_folders: + logging.info("No changes detected via Google Drive API") + return { + "user": user, + "name": job_name, + "tokens": 0, + "type": source_type, + "status": "no_changes" + } + + file_ids = [f['id'] for f in modified_files] + folder_ids = [f['id'] for f in modified_folders] + if source_type == "google_drive": if not session_token: raise ValueError("Google Drive connector requires session_token") from application.parser.connectors.connector_creator import ConnectorCreator remote_loader = ConnectorCreator.create_connector("google_drive", session_token) - 
+ # Create a clean config for storage that excludes the session token api_source_config = { "file_ids": file_ids or [], "folder_ids": folder_ids or [], "recursive": recursive } - + # Step 2: Download files to temp directory self.update_state(state="PROGRESS", meta={"current": 20, "status": "Downloading files"}) download_info = remote_loader.download_to_directory( - temp_dir, + temp_dir, { "file_ids": file_ids or [], "folder_ids": folder_ids or [], @@ -942,6 +999,8 @@ def ingest_connector( ) raw_docs = reader.load_data() directory_structure = getattr(reader, 'directory_structure', {}) + + # Step 4: Process documents (chunking, embedding, etc.) self.update_state(state="PROGRESS", meta={"current": 60, "status": "Processing documents"}) @@ -964,8 +1023,16 @@ def ingest_connector( docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs] - # Step 5: Store in vector database - id = ObjectId() + if operation_mode == "upload": + id = ObjectId() + elif operation_mode == "sync": + if not doc_id or not ObjectId.is_valid(doc_id): + logging.error("Invalid doc_id provided for sync operation: %s", doc_id) + raise ValueError("doc_id must be provided for sync operation.") + id = ObjectId(doc_id) + else: + raise ValueError(f"Invalid operation_mode: {operation_mode}") + vector_store_path = os.path.join(temp_dir, "vector_store") os.makedirs(vector_store_path, exist_ok=True) @@ -986,16 +1053,22 @@ def ingest_connector( "provider": source_type, **api_source_config }), - "directory_structure": json.dumps(directory_structure) + "directory_structure": json.dumps(directory_structure), + "sync_frequency": sync_frequency } - + + if operation_mode == "sync": + file_data["last_sync"] = datetime.datetime.now() + else: + file_data["last_sync"] = datetime.datetime.now() + upload_index(vector_store_path, file_data) - + # Ensure we mark the task as complete self.update_state(state="PROGRESS", meta={"current": 100, "status": "Complete"}) - + logging.info(f"Remote ingestion completed: {job_name}") - + return { "user": user, "name": job_name, From c2c18e8319fb06d7649897a84ee701dbd2d2d644 Mon Sep 17 00:00:00 2001 From: ManishMadan2882 Date: Tue, 2 Sep 2025 13:36:41 +0530 Subject: [PATCH 19/25] (feat:connector,fe) sync api, notification --- frontend/src/api/endpoints.ts | 1 + frontend/src/api/services/userService.ts | 12 ++++++++++++ frontend/src/components/ConnectorTreeComponent.tsx | 1 - 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/frontend/src/api/endpoints.ts b/frontend/src/api/endpoints.ts index 81d19c87..955f43ee 100644 --- a/frontend/src/api/endpoints.ts +++ b/frontend/src/api/endpoints.ts @@ -38,6 +38,7 @@ const endpoints = { UPDATE_TOOL_STATUS: '/api/update_tool_status', UPDATE_TOOL: '/api/update_tool', DELETE_TOOL: '/api/delete_tool', + SYNC_CONNECTOR: '/api/connectors/sync', GET_CHUNKS: ( docId: string, page: number, diff --git a/frontend/src/api/services/userService.ts b/frontend/src/api/services/userService.ts index af5e4f22..7d365b3d 100644 --- a/frontend/src/api/services/userService.ts +++ b/frontend/src/api/services/userService.ts @@ -104,6 +104,18 @@ const userService = { apiClient.get(endpoints.USER.DIRECTORY_STRUCTURE(docId), token), manageSourceFiles: (data: FormData, token: string | null): Promise => apiClient.postFormData(endpoints.USER.MANAGE_SOURCE_FILES, data, token), + syncConnector: (docId: string, token: string | null): Promise => { + const sessionToken = localStorage.getItem('google_drive_session_token'); + return apiClient.post( + endpoints.USER.SYNC_CONNECTOR, + { + 
source_id: docId, + session_token: sessionToken, + provider: 'google_drive' + }, + token + ); + }, }; export default userService; diff --git a/frontend/src/components/ConnectorTreeComponent.tsx b/frontend/src/components/ConnectorTreeComponent.tsx index a4258dfc..96e235c6 100644 --- a/frontend/src/components/ConnectorTreeComponent.tsx +++ b/frontend/src/components/ConnectorTreeComponent.tsx @@ -171,7 +171,6 @@ const ConnectorTreeComponent: React.FC = ({ variant: 'primary', }); - // No delete option for connector files return options; }; From f9b2c9569513c180515ed301c10f7bc3e20668e4 Mon Sep 17 00:00:00 2001 From: ManishMadan2882 Date: Tue, 2 Sep 2025 18:06:04 +0530 Subject: [PATCH 20/25] (feat:connector) sync, simply re-ingest --- .../parser/connectors/google_drive/loader.py | 108 ------------------ application/worker.py | 106 +++-------------- .../src/components/ConnectorTreeComponent.tsx | 89 ++++++++++++++- frontend/src/locale/en.json | 1 + frontend/src/locale/es.json | 1 + frontend/src/locale/jp.json | 1 + frontend/src/locale/ru.json | 1 + frontend/src/locale/zh-TW.json | 1 + frontend/src/locale/zh.json | 1 + 9 files changed, 112 insertions(+), 197 deletions(-) diff --git a/application/parser/connectors/google_drive/loader.py b/application/parser/connectors/google_drive/loader.py index 06737748..22d6acc9 100644 --- a/application/parser/connectors/google_drive/loader.py +++ b/application/parser/connectors/google_drive/loader.py @@ -146,115 +146,7 @@ class GoogleDriveLoader(BaseConnectorLoader): logging.error(f"Error loading data from Google Drive: {e}", exc_info=True) raise - def scan_drive_contents(self, file_ids: List[str], folder_ids: List[str], - modified_after: str = "2024-01-01T00:00:00Z") -> Dict[str, Any]: - """ - Scan Google Drive contents and check for files/folders modified after a specific date. 
- Args: - file_ids: List of specific file IDs to check - folder_ids: List of folder IDs to scan for modified contents - modified_after: ISO 8601 formatted date string (default: "2024-01-01T00:00:00Z") - - Returns: - Dictionary containing: - - 'modified_files': List of file IDs that were modified after the given date - - 'modified_folders': List of folder IDs that were modified after the given date - - 'scan_summary': Summary of the scan results - """ - self._ensure_service() - - modified_files = [] - modified_folders = [] - - try: - for file_id in file_ids: - try: - file_metadata = self.service.files().get( - fileId=file_id, - fields='id,name,modifiedTime,mimeType' - ).execute() - - modified_time = file_metadata.get('modifiedTime', '') - if modified_time > modified_after: - modified_files.append({ - 'id': file_id, - 'name': file_metadata.get('name', 'Unknown'), - 'modifiedTime': modified_time, - 'mimeType': file_metadata.get('mimeType', '') - }) - - except Exception as e: - logging.warning(f"Error checking file {file_id}: {e}") - continue - - for folder_id in folder_ids: - try: - folder_metadata = self.service.files().get( - fileId=folder_id, - fields='id,name,modifiedTime,mimeType' - ).execute() - - folder_modified_time = folder_metadata.get('modifiedTime', '') - if folder_modified_time > modified_after: - modified_folders.append({ - 'id': folder_id, - 'name': folder_metadata.get('name', 'Unknown'), - 'modifiedTime': folder_modified_time, - 'mimeType': folder_metadata.get('mimeType', '') - }) - - query = f"'{folder_id}' in parents and modifiedTime > '{modified_after}'" - - page_token = None - while True: - results = self.service.files().list( - q=query, - spaces='drive', - fields='nextPageToken, files(id, name, modifiedTime, mimeType)', - pageToken=page_token - ).execute() - - items = results.get('files', []) - - for item in items: - item_info = { - 'id': item['id'], - 'name': item['name'], - 'modifiedTime': item['modifiedTime'], - 'mimeType': item['mimeType'] - } - - if item['mimeType'] == 'application/vnd.google-apps.folder': - modified_folders.append(item_info) - else: - modified_files.append(item_info) - - page_token = results.get('nextPageToken') - if not page_token: - break - - except Exception as e: - logging.warning(f"Error scanning folder {folder_id}: {e}") - continue - - summary = { - 'total_modified_files': len(modified_files), - 'total_modified_folders': len(modified_folders), - 'scan_date': modified_after - } - - logging.info(f"Drive scan completed: {summary['total_modified_files']} files and {summary['total_modified_folders']} folders modified after {modified_after}") - - return { - 'modified_files': modified_files, - 'modified_folders': modified_folders, - 'scan_summary': summary - } - - except Exception as e: - logging.error(f"Error scanning drive contents: {e}", exc_info=True) - raise def _load_file_by_id(self, file_id: str, load_content: bool = True) -> Optional[Document]: self._ensure_service() diff --git a/application/worker.py b/application/worker.py index 75519df6..10fb6c2b 100755 --- a/application/worker.py +++ b/application/worker.py @@ -22,6 +22,7 @@ from application.api.answer.services.stream_processor import get_prompt from application.core.mongo_db import MongoDB from application.core.settings import settings from application.parser.chunking import Chunker +from application.parser.connectors.connector_creator import ConnectorCreator from application.parser.embedding_pipeline import embed_and_store_documents from application.parser.file.bulk import 
SimpleDirectoryReader from application.parser.remote.remote_creator import RemoteCreator @@ -879,98 +880,27 @@ def ingest_connector( # Step 1: Initialize the appropriate loader self.update_state(state="PROGRESS", meta={"current": 10, "status": "Initializing connector"}) - # Handle incremental sync using Google Drive API directly - current_sync_time = datetime.datetime.now().isoformat() + 'Z' + if not session_token: + raise ValueError(f"{source_type} connector requires session_token") - if operation_mode == "sync": - if source_type == "google_drive": - from application.parser.connectors.connector_creator import ConnectorCreator - remote_loader = ConnectorCreator.create_connector("google_drive", session_token) + if not ConnectorCreator.is_supported(source_type): + raise ValueError(f"Unsupported connector type: {source_type}. Supported types: {ConnectorCreator.get_supported_connectors()}") - source = sources_collection.find_one({"_id": ObjectId(doc_id)}) - - last_sync_time = source.get("last_sync") - if not last_sync_time: - last_sync_time = source.get("date") - + remote_loader = ConnectorCreator.create_connector(source_type, session_token) - scan_results = remote_loader.scan_drive_contents( - file_ids or [], - folder_ids or [], - modified_after=last_sync_time - ) + # Create a clean config for storage + api_source_config = { + "file_ids": file_ids or [], + "folder_ids": folder_ids or [], + "recursive": recursive + } - modified_files = scan_results.get('modified_files', []) - modified_folders = scan_results.get('modified_folders', []) - - # Log atomic changes detected via Google Drive API - if modified_files: - logging.info(f"Files modified since last sync: {len(modified_files)} files") - for f in modified_files: - logging.info(f" - {f['name']} (ID: {f['id']}, Modified: {f['modifiedTime']})") - - if modified_folders: - logging.info(f"Folders modified since last sync: {len(modified_folders)} folders") - for f in modified_folders: - logging.info(f" - {f['name']} (ID: {f['id']}, Modified: {f['modifiedTime']})") - - if not modified_files and not modified_folders: - logging.info("No changes detected via Google Drive API") - return { - "user": user, - "name": job_name, - "tokens": 0, - "type": source_type, - "status": "no_changes" - } - - file_ids = [f['id'] for f in modified_files] - folder_ids = [f['id'] for f in modified_folders] - - if source_type == "google_drive": - if not session_token: - raise ValueError("Google Drive connector requires session_token") - - from application.parser.connectors.connector_creator import ConnectorCreator - remote_loader = ConnectorCreator.create_connector("google_drive", session_token) - - # Create a clean config for storage that excludes the session token - api_source_config = { - "file_ids": file_ids or [], - "folder_ids": folder_ids or [], - "recursive": recursive - } - - # Step 2: Download files to temp directory - self.update_state(state="PROGRESS", meta={"current": 20, "status": "Downloading files"}) - download_info = remote_loader.download_to_directory( - temp_dir, - { - "file_ids": file_ids or [], - "folder_ids": folder_ids or [], - "recursive": recursive - } - ) - else: - # For other external knowledge base connectors (future: dropbox, onedrive, etc.) - from application.parser.connectors.connector_creator import ConnectorCreator - - if not ConnectorCreator.is_supported(source_type): - raise ValueError(f"Unsupported connector type: {source_type}. 
Supported types: {ConnectorCreator.get_supported_connectors()}") - - # Create connector with session token and other parameters - remote_loader = ConnectorCreator.create_connector(source_type, session_token) - - api_source_config = { - "file_ids": file_ids or [], - "folder_ids": folder_ids or [], - "recursive": recursive - } - - download_info = remote_loader.download_to_directory( - temp_dir, - api_source_config - ) + # Step 2: Download files to temp directory + self.update_state(state="PROGRESS", meta={"current": 20, "status": "Downloading files"}) + download_info = remote_loader.download_to_directory( + temp_dir, + api_source_config + ) if download_info.get("empty_result", False) or not download_info.get("files_downloaded", 0): logging.warning(f"No files were downloaded from {source_type}") diff --git a/frontend/src/components/ConnectorTreeComponent.tsx b/frontend/src/components/ConnectorTreeComponent.tsx index 96e235c6..9e8ccdf9 100644 --- a/frontend/src/components/ConnectorTreeComponent.tsx +++ b/frontend/src/components/ConnectorTreeComponent.tsx @@ -10,7 +10,7 @@ import FolderIcon from '../assets/folder.svg'; import ArrowLeft from '../assets/arrow-left.svg'; import ThreeDots from '../assets/three-dots.svg'; import EyeView from '../assets/eye-view.svg'; -import SearchIcon from '../assets/search.svg'; +import SyncIcon from '../assets/sync.svg'; import { useOutsideAlerter } from '../hooks'; interface FileNode { @@ -59,6 +59,8 @@ const ConnectorTreeComponent: React.FC = ({ const [searchQuery, setSearchQuery] = useState(''); const [searchResults, setSearchResults] = useState([]); const searchDropdownRef = useRef(null); + const [isSyncing, setIsSyncing] = useState(false); + const [syncProgress, setSyncProgress] = useState(0); useOutsideAlerter( searchDropdownRef, @@ -78,6 +80,71 @@ const ConnectorTreeComponent: React.FC = ({ }); }; + const handleSync = async () => { + if (isSyncing) return; + + setIsSyncing(true); + setSyncProgress(0); + + try { + const response = await userService.syncConnector(docId, token); + const data = await response.json(); + + if (data.success) { + console.log('Sync started successfully:', data.task_id); + setSyncProgress(10); + + // Poll task status using userService + const maxAttempts = 30; + const pollInterval = 2000; + + for (let attempt = 0; attempt < maxAttempts; attempt++) { + try { + const statusResponse = await userService.getTaskStatus(data.task_id, token); + const statusData = await statusResponse.json(); + + console.log(`Task status (attempt ${attempt + 1}):`, statusData.status); + + if (statusData.status === 'SUCCESS') { + setSyncProgress(100); + console.log('Sync completed successfully'); + + // Refresh directory structure + try { + const refreshResponse = await userService.getDirectoryStructure(docId, token); + const refreshData = await refreshResponse.json(); + if (refreshData && refreshData.directory_structure) { + setDirectoryStructure(refreshData.directory_structure); + } + } catch (err) { + console.error('Error refreshing directory structure:', err); + } + break; + } else if (statusData.status === 'FAILURE') { + console.error('Sync task failed:', statusData.result); + break; + } else if (statusData.status === 'PROGRESS') { + const progress = statusData.meta?.current || 0; + setSyncProgress(Math.max(10, progress)); // Ensure minimum 10% after start + } + + await new Promise((resolve) => setTimeout(resolve, pollInterval)); + } catch (error) { + console.error('Error polling task status:', error); + break; + } + } + } else { + console.error('Sync 
failed:', data.error); + } + } catch (err) { + console.error('Error syncing connector:', err); + } finally { + setIsSyncing(false); + setSyncProgress(0); + } + }; + useEffect(() => { const fetchDirectoryStructure = async () => { try { @@ -247,7 +314,27 @@ const ConnectorTreeComponent: React.FC = ({
+ {renderFileSearch()} + + {/* Sync button */} +
); diff --git a/frontend/src/locale/en.json b/frontend/src/locale/en.json index d0d1b4b3..39e2bee7 100644 --- a/frontend/src/locale/en.json +++ b/frontend/src/locale/en.json @@ -67,6 +67,7 @@ "preLoaded": "Pre-loaded", "private": "Private", "sync": "Sync", + "syncing": "Syncing...", "syncFrequency": { "never": "Never", "daily": "Daily", diff --git a/frontend/src/locale/es.json b/frontend/src/locale/es.json index 64e204dd..1c8afa6c 100644 --- a/frontend/src/locale/es.json +++ b/frontend/src/locale/es.json @@ -67,6 +67,7 @@ "preLoaded": "Precargado", "private": "Privado", "sync": "Sincronizar", + "syncing": "Sincronizando...", "syncFrequency": { "never": "Nunca", "daily": "Diario", diff --git a/frontend/src/locale/jp.json b/frontend/src/locale/jp.json index 5b93d182..ef29bd78 100644 --- a/frontend/src/locale/jp.json +++ b/frontend/src/locale/jp.json @@ -67,6 +67,7 @@ "preLoaded": "プリロード済み", "private": "プライベート", "sync": "同期", + "syncing": "同期中...", "syncFrequency": { "never": "なし", "daily": "毎日", diff --git a/frontend/src/locale/ru.json b/frontend/src/locale/ru.json index 6a83f9a3..a8506451 100644 --- a/frontend/src/locale/ru.json +++ b/frontend/src/locale/ru.json @@ -67,6 +67,7 @@ "preLoaded": "Предзагруженный", "private": "Частный", "sync": "Синхронизация", + "syncing": "Синхронизация...", "syncFrequency": { "never": "Никогда", "daily": "Ежедневно", diff --git a/frontend/src/locale/zh-TW.json b/frontend/src/locale/zh-TW.json index 8c734ff3..4f9c623b 100644 --- a/frontend/src/locale/zh-TW.json +++ b/frontend/src/locale/zh-TW.json @@ -67,6 +67,7 @@ "preLoaded": "預載入", "private": "私人", "sync": "同步", + "syncing": "同步中...", "syncFrequency": { "never": "從不", "daily": "每天", diff --git a/frontend/src/locale/zh.json b/frontend/src/locale/zh.json index eb216c86..014e8256 100644 --- a/frontend/src/locale/zh.json +++ b/frontend/src/locale/zh.json @@ -67,6 +67,7 @@ "preLoaded": "预加载", "private": "私有", "sync": "同步", + "syncing": "同步中...", "syncFrequency": { "never": "从不", "daily": "每天", From 3b3a04a2499e61f41c832bb80980470b32913338 Mon Sep 17 00:00:00 2001 From: ManishMadan2882 Date: Tue, 2 Sep 2025 20:28:23 +0530 Subject: [PATCH 21/25] (feat:connector) sync fixes UI, minor refactor --- application/api/user/routes.py | 14 ++- frontend/src/api/services/userService.ts | 11 +-- .../src/components/ConnectorTreeComponent.tsx | 86 ++++++++++++------- frontend/src/upload/Upload.tsx | 21 +++-- frontend/src/utils/providerUtils.ts | 17 ++++ 5 files changed, 102 insertions(+), 47 deletions(-) create mode 100644 frontend/src/utils/providerUtils.ts diff --git a/application/api/user/routes.py b/application/api/user/routes.py index 15024545..7eae66f6 100644 --- a/application/api/user/routes.py +++ b/application/api/user/routes.py @@ -3965,12 +3965,24 @@ class DirectoryStructure(Resource): ) directory_structure = doc.get("directory_structure", {}) + base_path = doc.get("file_path", "") + + provider = None + remote_data = doc.get("remote_data") + try: + if isinstance(remote_data, str) and remote_data: + remote_data_obj = json.loads(remote_data) + provider = remote_data_obj.get("provider") + except Exception as e: + current_app.logger.warning( + f"Failed to parse remote_data for doc {doc_id}: {e}") return make_response( jsonify({ "success": True, "directory_structure": directory_structure, - "base_path": doc.get("file_path", "") + "base_path": base_path, + "provider": provider, }), 200 ) diff --git a/frontend/src/api/services/userService.ts b/frontend/src/api/services/userService.ts index 7d365b3d..6e375951 100644 --- 
+++ b/frontend/src/api/services/userService.ts
@@ -1,5 +1,6 @@
 import apiClient from '../client';
 import endpoints from '../endpoints';
+import { getSessionToken } from '../../utils/providerUtils';
 
 const userService = {
   getConfig: (): Promise<any> => apiClient.get(endpoints.USER.CONFIG, null),
@@ -104,14 +105,14 @@ const userService = {
     apiClient.get(endpoints.USER.DIRECTORY_STRUCTURE(docId), token),
   manageSourceFiles: (data: FormData, token: string | null): Promise<any> =>
     apiClient.postFormData(endpoints.USER.MANAGE_SOURCE_FILES, data, token),
-  syncConnector: (docId: string, token: string | null): Promise<any> => {
-    const sessionToken = localStorage.getItem('google_drive_session_token');
+  syncConnector: (docId: string, provider: string, token: string | null): Promise<any> => {
+    const sessionToken = getSessionToken(provider);
     return apiClient.post(
       endpoints.USER.SYNC_CONNECTOR,
-      {
-        source_id: docId,
+      {
+        source_id: docId,
         session_token: sessionToken,
-        provider: 'google_drive'
+        provider: provider
       },
       token
     );
diff --git a/frontend/src/components/ConnectorTreeComponent.tsx b/frontend/src/components/ConnectorTreeComponent.tsx
index 9e8ccdf9..972c55f7 100644
--- a/frontend/src/components/ConnectorTreeComponent.tsx
+++ b/frontend/src/components/ConnectorTreeComponent.tsx
@@ -61,6 +61,9 @@ const ConnectorTreeComponent: React.FC = ({
   const searchDropdownRef = useRef(null);
   const [isSyncing, setIsSyncing] = useState(false);
   const [syncProgress, setSyncProgress] = useState(0);
+  const [sourceProvider, setSourceProvider] = useState('');
+  const [syncDone, setSyncDone] = useState(false);
+
   useOutsideAlerter(
     searchDropdownRef,
@@ -81,13 +84,16 @@ const ConnectorTreeComponent: React.FC = ({
   };
 
   const handleSync = async () => {
+    if (isSyncing) return;
+    const provider = sourceProvider;
+
     setIsSyncing(true);
     setSyncProgress(0);
 
     try {
-      const response = await userService.syncConnector(docId, token);
+      const response = await userService.syncConnector(docId, provider, token);
       const data = await response.json();
 
       if (data.success) {
@@ -115,7 +121,14 @@ const ConnectorTreeComponent: React.FC = ({
               const refreshData = await refreshResponse.json();
               if (refreshData && refreshData.directory_structure) {
                 setDirectoryStructure(refreshData.directory_structure);
+                setCurrentPath([]);
               }
+              if (refreshData && refreshData.provider) {
+                setSourceProvider(refreshData.provider);
+              }
+
+              setSyncDone(true);
+              setTimeout(() => setSyncDone(false), 5000);
             } catch (err) {
               console.error('Error refreshing directory structure:', err);
             }
@@ -124,8 +137,13 @@ const ConnectorTreeComponent: React.FC = ({
             console.error('Sync task failed:', statusData.result);
             break;
           } else if (statusData.status === 'PROGRESS') {
-            const progress = statusData.meta?.current || 0;
-            setSyncProgress(Math.max(10, progress)); // Ensure minimum 10% after start
+
+            const progress = Number((statusData.result && statusData.result.current != null)
+              ? statusData.result.current
+              : (statusData.meta && statusData.meta.current != null)
+                ? statusData.meta.current
+                : 0);
+            setSyncProgress(Math.max(10, progress));
           }
 
           await new Promise((resolve) => setTimeout(resolve, pollInterval));
@@ -149,16 +167,21 @@ const ConnectorTreeComponent: React.FC = ({
   const fetchDirectoryStructure = async () => {
     try {
       setLoading(true);
-      const response = await userService.getDirectoryStructure(docId, token);
-      const data = await response.json();
-      if (data && data.directory_structure) {
-        setDirectoryStructure(data.directory_structure);
+      const directoryResponse = await userService.getDirectoryStructure(docId, token);
+      const directoryData = await directoryResponse.json();
+
+      if (directoryData && directoryData.directory_structure) {
+        setDirectoryStructure(directoryData.directory_structure);
       } else {
         setError('Invalid response format');
       }
+
+      if (directoryData && directoryData.provider) {
+        setSourceProvider(directoryData.provider);
+      }
     } catch (err) {
-      setError('Failed to load directory structure');
+      setError('Failed to load source information');
       console.error(err);
     } finally {
       setLoading(false);
     }
@@ -247,7 +270,7 @@ const ConnectorTreeComponent: React.FC = ({
   ): { totalSize: number; totalTokens: number } => {
     let totalSize = 0;
     let totalTokens = 0;
-    
+
     Object.entries(structure).forEach(([_, node]) => {
       if (node.type) {
         // It's a file
@@ -260,10 +283,10 @@ const ConnectorTreeComponent: React.FC = ({
         totalTokens += stats.totalTokens;
       }
     });
-    
+
    return { totalSize, totalTokens };
  };
-  
+
   const handleBackNavigation = () => {
     if (selectedFile) {
       setSelectedFile(null);
@@ -287,24 +310,21 @@ const ConnectorTreeComponent: React.FC = ({
         >
           left-arrow
-
-
- + +
+ {sourceName} {currentPath.length > 0 && ( <> - / + / {currentPath.map((dir, index) => ( - + {index < currentPath.length - 1 && ( - / + / )} ))} @@ -326,14 +346,16 @@ const ConnectorTreeComponent: React.FC = ({ ? 'bg-gray-300 text-gray-600 cursor-not-allowed dark:bg-gray-600 dark:text-gray-400' : 'bg-purple-30 hover:bg-violets-are-blue text-white' }`} - title={isSyncing ? `${t('settings.sources.syncing')} ${syncProgress}%` : t('settings.sources.sync')} + title={isSyncing + ? `${t('settings.sources.syncing')} ${syncProgress}%` + : (syncDone ? 'Done' : t('settings.sources.sync'))} > {t('settings.sources.sync')} - {isSyncing ? `${syncProgress}%` : t('settings.sources.sync')} + {isSyncing ? `${syncProgress}%` : (syncDone ? 'Done' : t('settings.sources.sync'))}
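The handleSync flow above is a kick-off-then-poll pattern: start the connector sync, poll the task-status endpoint until it reports SUCCESS or FAILURE, and map the task's `current` value onto `syncProgress`, floored at 10% so the bar moves as soon as the task starts. A condensed sketch of that loop follows; `getTaskStatus` and the `{ status, result, meta }` payload shape are assumptions inferred from this diff, not the service's confirmed API.

```typescript
import userService from '../api/services/userService'; // assumed path, as in the diff

// Sketch only: poll a background sync task until it reaches a terminal state.
// Assumes a userService.getTaskStatus(taskId, token) endpoint returning
// { status, result?: { current }, meta?: { current } } (inferred, not verified).
async function pollSyncTask(
  taskId: string,
  token: string | null,
  onProgress: (pct: number) => void,
  pollInterval = 2000,
  maxAttempts = 150,
): Promise<boolean> {
  for (let attempt = 0; attempt < maxAttempts; attempt++) {
    const res = await userService.getTaskStatus(taskId, token);
    const statusData = await res.json();

    if (statusData.status === 'SUCCESS') return true;
    if (statusData.status === 'FAILURE') return false;
    if (statusData.status === 'PROGRESS') {
      // Prefer result.current, fall back to meta.current, floor at 10%,
      // mirroring the logic the patch adds above.
      const current = Number(
        statusData.result?.current ?? statusData.meta?.current ?? 0,
      );
      onProgress(Math.max(10, current));
    }
    await new Promise((resolve) => setTimeout(resolve, pollInterval));
  }
  return false; // give up after maxAttempts polls
}
```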
@@ -379,25 +401,25 @@ const ConnectorTreeComponent: React.FC = ({ const sortedEntries = Object.entries(directory).sort(([nameA, nodeA], [nameB, nodeB]) => { const isFileA = !!nodeA.type; const isFileB = !!nodeB.type; - + if (isFileA !== isFileB) { return isFileA ? 1 : -1; // Directories first } - + return nameA.localeCompare(nameB); // Alphabetical within each group }); - + // Process directories const directoryRows = sortedEntries .filter(([_, node]) => !node.type) .map(([name, node]) => { const itemId = `dir-${name}`; const menuRef = getMenuRef(itemId); - + // Calculate directory stats const dirStats = calculateDirectoryStats(node as DirectoryStructure); - + return ( = ({ .map(([name, node]) => { const itemId = `file-${name}`; const menuRef = getMenuRef(itemId); - + return ( = ({ } }} placeholder={t('settings.sources.searchFiles')} - className={`w-full h-[38px] border border-[#D1D9E0] px-4 py-2 dark:border-[#6A6A6A] - ${searchQuery ? 'rounded-t-[24px]' : 'rounded-[24px]'} + className={`w-full h-[38px] border border-[#D1D9E0] px-4 py-2 dark:border-[#6A6A6A] + ${searchQuery ? 'rounded-t-[24px]' : 'rounded-[24px]'} bg-transparent focus:outline-none dark:text-[#E0E0E0]`} /> diff --git a/frontend/src/upload/Upload.tsx b/frontend/src/upload/Upload.tsx index 8020a0d5..c780e68c 100644 --- a/frontend/src/upload/Upload.tsx +++ b/frontend/src/upload/Upload.tsx @@ -4,6 +4,7 @@ import { useTranslation } from 'react-i18next'; import { useDispatch, useSelector } from 'react-redux'; import userService from '../api/services/userService'; +import { getSessionToken, setSessionToken, removeSessionToken } from '../utils/providerUtils'; import FileUpload from '../assets/file_upload.svg'; import WebsiteCollect from '../assets/website_collect.svg'; import Dropdown from '../components/Dropdown'; @@ -62,6 +63,8 @@ function Upload({ const [currentFolderId, setCurrentFolderId] = useState(null); const [folderPath, setFolderPath] = useState>([{id: null, name: 'My Drive'}]); + + const renderFormFields = () => { const schema = IngestorFormSchemas[ingestor.type]; if (!schema) return null; @@ -445,7 +448,7 @@ function Upload({ let configData; if (ingestor.type === 'google_drive') { - const sessionToken = localStorage.getItem('google_drive_session_token'); + const sessionToken = getSessionToken(ingestor.type); const selectedItems = googleDriveFiles.filter(file => selectedFiles.includes(file.id)); const selectedFolderIds = selectedItems @@ -497,7 +500,7 @@ function Upload({ useEffect(() => { if (ingestor.type === 'google_drive') { - const sessionToken = localStorage.getItem('google_drive_session_token'); + const sessionToken = getSessionToken(ingestor.type); if (sessionToken) { // Auto-authenticate if session token exists @@ -524,7 +527,7 @@ function Upload({ }); if (!validateResponse.ok) { - localStorage.removeItem('google_drive_session_token'); + removeSessionToken(ingestor.type); setIsGoogleDriveConnected(false); setAuthError('Session expired. Please reconnect to Google Drive.'); return; @@ -536,7 +539,7 @@ function Upload({ setUserEmail(validateData.user_email || 'Connected User'); loadGoogleDriveFiles(sessionToken, null); } else { - localStorage.removeItem('google_drive_session_token'); + removeSessionToken(ingestor.type); setIsGoogleDriveConnected(false); setAuthError(validateData.error || 'Session expired. 
Please reconnect your Google Drive account and make sure to grant offline access.'); } @@ -640,7 +643,7 @@ function Upload({ }; const handleFolderClick = (folderId: string, folderName: string) => { - const sessionToken = localStorage.getItem('google_drive_session_token'); + const sessionToken = getSessionToken(ingestor.type); if (sessionToken) { setCurrentFolderId(folderId); setFolderPath(prev => [...prev, {id: folderId, name: folderName}]); @@ -649,7 +652,7 @@ function Upload({ }; const navigateBack = (index: number) => { - const sessionToken = localStorage.getItem('google_drive_session_token'); + const sessionToken = getSessionToken(ingestor.type); if (sessionToken) { const newPath = folderPath.slice(0, index + 1); const targetFolderId = newPath[newPath.length - 1]?.id; @@ -894,7 +897,7 @@ function Upload({ setAuthError(''); if (data.session_token) { - localStorage.setItem('google_drive_session_token', data.session_token); + setSessionToken(ingestor.type, data.session_token); loadGoogleDriveFiles(data.session_token, null); } }} @@ -916,7 +919,7 @@ function Upload({
- + {sourceName} {currentPath.length > 0 && ( @@ -324,7 +334,9 @@ const ConnectorTreeComponent: React.FC = ({ {dir} {index < currentPath.length - 1 && ( - / + + / + )} ))} @@ -333,29 +345,36 @@ const ConnectorTreeComponent: React.FC = ({
-
- +
{renderFileSearch()} {/* Sync button */}
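Patch 21's new `frontend/src/utils/providerUtils.ts` (17 lines in the diffstat) never appears in full in this series; only its call sites do: `getSessionToken(provider)` in userService.ts and the `getSessionToken` / `setSessionToken` / `removeSessionToken` calls in Upload.tsx. Below is a minimal sketch consistent with those call sites and with the old `google_drive_session_token` localStorage key it replaces. It is an assumed reconstruction, not the actual file.

```typescript
// frontend/src/utils/providerUtils.ts (assumed reconstruction, not the real file).
// Each connector keeps its own session token under a per-provider key, so
// getSessionToken('google_drive') reads the old 'google_drive_session_token' key.
const sessionTokenKey = (provider: string): string => `${provider}_session_token`;

export function getSessionToken(provider: string): string | null {
  return localStorage.getItem(sessionTokenKey(provider));
}

export function setSessionToken(provider: string, token: string): void {
  localStorage.setItem(sessionTokenKey(provider), token);
}

export function removeSessionToken(provider: string): void {
  localStorage.removeItem(sessionTokenKey(provider));
}
```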
@@ -369,46 +388,47 @@ const ConnectorTreeComponent: React.FC = ({ const parentRow = currentPath.length > 0 ? [ - - -
- {t('settings.sources.parentFolderAlt')} - - .. - -
- - - - - - - - - - - , - ] + + +
+ {t('settings.sources.parentFolderAlt')} + + .. + +
+ + + - + + + - + + + , + ] : []; // Sort entries: directories first, then files, both alphabetically - const sortedEntries = Object.entries(directory).sort(([nameA, nodeA], [nameB, nodeB]) => { - const isFileA = !!nodeA.type; - const isFileB = !!nodeB.type; + const sortedEntries = Object.entries(directory).sort( + ([nameA, nodeA], [nameB, nodeB]) => { + const isFileA = !!nodeA.type; + const isFileB = !!nodeB.type; - if (isFileA !== isFileB) { - return isFileA ? 1 : -1; // Directories first - } - - return nameA.localeCompare(nameB); // Alphabetical within each group - }); + if (isFileA !== isFileB) { + return isFileA ? 1 : -1; // Directories first + } + return nameA.localeCompare(nameB); // Alphabetical within each group + }, + ); // Process directories const directoryRows = sortedEntries @@ -450,7 +470,7 @@ const ConnectorTreeComponent: React.FC = ({
-
- {isLoadingFiles ? ( +
+ {isLoadingFiles && googleDriveFiles.length === 0 ? (
@@ -995,60 +990,96 @@ function Upload({ No files found in your Google Drive
) : ( -
- {googleDriveFiles.map((file) => ( -
-
-
- handleFileSelect(file.id)} - className="h-4 w-4 text-blue-600 rounded border-gray-300 focus:ring-blue-500" - /> -
- {file.type === 'application/vnd.google-apps.folder' || file.isFolder ? ( -
handleFolderClick(file.id, file.name)} - > - Folder + <> +
+ {googleDriveFiles.map((file) => ( +
+
+
+ handleFileSelect(file.id)} + className="h-4 w-4 text-blue-600 rounded border-gray-300 focus:ring-blue-500" + />
- ) : ( -
- File + {file.type === 'application/vnd.google-apps.folder' || file.isFolder ? ( +
handleFolderClick(file.id, file.name)} + > + Folder +
+ ) : ( +
+ File +
+ )} +
+

{ + if (file.type === 'application/vnd.google-apps.folder' || file.isFolder) { + handleFolderClick(file.id, file.name); + } + }} + > + {file.name} +

+

+ {file.size && `${formatBytes(file.size)} • `}Modified {formatDate(file.modifiedTime)} +

- )} -
-

{ - if (file.type === 'application/vnd.google-apps.folder' || file.isFolder) { - handleFolderClick(file.id, file.name); - } - }} - > - {file.name} -

-

- {file.size} • Modified {file.modifiedTime} -

-
- ))} -
+ ))} +
+ +
+ {hasMoreFiles && !isLoadingFiles && ( + + )} + {isLoadingFiles && ( +
+
+ Loading more files... +
+ )} +{!hasMoreFiles && !isLoadingFiles && ( + All files loaded + )} +
+ + + )}
+ + +
       )}
diff --git a/frontend/src/utils/stringUtils.ts b/frontend/src/utils/stringUtils.ts
index e87a7af3..89c69df2 100644
--- a/frontend/src/utils/stringUtils.ts
+++ b/frontend/src/utils/stringUtils.ts
@@ -2,3 +2,12 @@ export function truncate(str: string, n: number) {
   // slices long strings and ends with ...
   return str.length > n ? str.slice(0, n - 1) + '...' : str;
 }
+
+export function formatBytes(bytes: number | null): string {
+  if (!bytes || bytes <= 0) return '';
+
+  const k = 1024;
+  const sizes = ['Bytes', 'KB', 'MB', 'GB', 'TB'];
+  const i = Math.floor(Math.log(bytes) / Math.log(k));
+  return `${parseFloat((bytes / Math.pow(k, i)).toFixed(2))} ${sizes[i]}`;
+}
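The file rows above now render sizes through this helper. For reference, a few worked outputs, computed from the definition rather than captured from the app:

```typescript
import { formatBytes } from './stringUtils';

formatBytes(null);       // ''          (falsy sizes render nothing)
formatBytes(512);        // '512 Bytes'
formatBytes(1536);       // '1.5 KB'    (1536 / 1024)
formatBytes(1048576);    // '1 MB'      (1024^2)
formatBytes(5368709120); // '5 GB'      (5 * 1024^3)
```

One edge case worth noting: inputs of 1024^5 and above index past the `sizes` array and produce `undefined` in the output. Drive file sizes never get near petabytes, so the patch reasonably leaves that unguarded.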
From 5a9bc6d2bf7c3e262585008581016442512934 Mon Sep 17 00:00:00 2001
From: ManishMadan2882
Date: Thu, 4 Sep 2025 08:35:41 +0530
Subject: [PATCH 25/25] (feat:connector) infinite scroll file pick

---
 frontend/src/upload/Upload.tsx | 44 +++++++++++++++++++---------------
 1 file changed, 25 insertions(+), 19 deletions(-)

diff --git a/frontend/src/upload/Upload.tsx b/frontend/src/upload/Upload.tsx
index a014bc21..46a36f4c 100644
--- a/frontend/src/upload/Upload.tsx
+++ b/frontend/src/upload/Upload.tsx
@@ -1045,34 +1045,16 @@ function Upload({
-                  {hasMoreFiles && !isLoadingFiles && (
-                  )}
                   {isLoadingFiles && (
                       Loading more files...
                   )}
-{!hasMoreFiles && !isLoadingFiles && (
+                  {!hasMoreFiles && !isLoadingFiles && (
                       All files loaded
                   )}
-
-
                 )}
@@ -1137,6 +1119,30 @@ function Upload({
   );
 }
+  useEffect(() => {
+    const scrollContainer = scrollContainerRef.current;
+
+    const handleScroll = () => {
+      if (!scrollContainer) return;
+
+      const { scrollTop, scrollHeight, clientHeight } = scrollContainer;
+      const isNearBottom = scrollHeight - scrollTop - clientHeight < 50;
+
+      if (isNearBottom && hasMoreFiles && !isLoadingFiles && nextPageToken) {
+        const sessionToken = getSessionToken(ingestor.type);
+        if (sessionToken) {
+          loadGoogleDriveFiles(sessionToken, currentFolderId, nextPageToken, true);
+        }
+      }
+    };
+
+    scrollContainer?.addEventListener('scroll', handleScroll);
+
+    return () => {
+      scrollContainer?.removeEventListener('scroll', handleScroll);
+    };
+  }, [hasMoreFiles, isLoadingFiles, nextPageToken, currentFolderId, ingestor.type]);
+
   return (
     <div
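One loose end in the hunk above: the effect reads `scrollContainerRef`, which this patch never declares, so the ref presumably already exists on the scrollable file-list container. The handler treats the list as near the bottom when the remaining scroll distance (`scrollHeight - scrollTop - clientHeight`) drops under 50px, then requests the next page with the stored `nextPageToken`, gated on `hasMoreFiles` and `!isLoadingFiles` so it cannot double-fetch. A standalone sketch of the pattern the effect assumes; apart from `scrollContainerRef` and the 50px threshold, every name here is illustrative:

```tsx
import React, { useEffect, useRef, useState } from 'react';

// Minimal sketch of the wiring the patch relies on: a ref on the scrollable
// container plus a near-bottom check that triggers the next page load.
function InfiniteList({ loadMore }: { loadMore: () => void }) {
  const scrollContainerRef = useRef<HTMLDivElement>(null); // name used by the effect
  const [items] = useState<string[]>([]);

  useEffect(() => {
    const el = scrollContainerRef.current;
    const onScroll = () => {
      if (!el) return;
      // Within 50px of the bottom counts as "near bottom", as in the patch.
      if (el.scrollHeight - el.scrollTop - el.clientHeight < 50) loadMore();
    };
    el?.addEventListener('scroll', onScroll);
    return () => el?.removeEventListener('scroll', onScroll);
  }, [loadMore]);

  return (
    <div ref={scrollContainerRef} style={{ maxHeight: 300, overflowY: 'auto' }}>
      {items.map((it) => (
        <div key={it}>{it}</div>
      ))}
    </div>
  );
}
```

A plain scroll listener keeps the change local to one effect; an IntersectionObserver on a sentinel row would avoid per-scroll work, but either approach fits the `hasMoreFiles` / `nextPageToken` pagination introduced earlier in the series.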