diff --git a/.env-template b/.env-template index e93f0363..b733bdf9 100644 --- a/.env-template +++ b/.env-template @@ -6,4 +6,20 @@ VITE_API_STREAMING=true OPENAI_API_BASE= OPENAI_API_VERSION= AZURE_DEPLOYMENT_NAME= -AZURE_EMBEDDINGS_DEPLOYMENT_NAME= \ No newline at end of file +AZURE_EMBEDDINGS_DEPLOYMENT_NAME= + +#Azure AD Application (client) ID +MICROSOFT_CLIENT_ID=your-azure-ad-client-id +#Azure AD Application client secret +MICROSOFT_CLIENT_SECRET=your-azure-ad-client-secret +#Azure AD Tenant ID (or 'common' for multi-tenant) +MICROSOFT_TENANT_ID=your-azure-ad-tenant-id +#Your project's redirect URI that you registered in Azure Portal. +#For example: http://localhost:5000/redirect +MICROSOFT_REDIRECT_URI=http://localhost:7091/api/connectors/callback/ms_entra_id +#If you are using a Microsoft Entra ID tenant, +#configure the AUTHORITY variable as +#"https://login.microsoftonline.com/TENANT_GUID" +#or "https://login.microsoftonline.com/contoso.onmicrosoft.com". +#Alternatively, use "https://login.microsoftonline.com/common" for multi-tenant app. 
+MICROSOFT_AUTHORITY=https://{tenantId}.ciamlogin.com/{tenantId} diff --git a/application/api/connector/routes.py b/application/api/connector/routes.py index 49307058..8b9cee38 100644 --- a/application/api/connector/routes.py +++ b/application/api/connector/routes.py @@ -298,10 +298,14 @@ class ConnectorsCallback(Resource): session_token = str(uuid.uuid4()) try: - credentials = auth.create_credentials_from_token_info(token_info) - service = auth.build_drive_service(credentials) - user_info = service.about().get(fields="user").execute() - user_email = user_info.get('user', {}).get('emailAddress', 'Connected User') + if provider == "google_drive": + credentials = auth.create_credentials_from_token_info(token_info) + service = auth.build_drive_service(credentials) + user_info = service.about().get(fields="user").execute() + user_email = user_info.get('user', {}).get('emailAddress', 'Connected User') + else: + user_email = token_info.get('user_info', {}).get('email', 'Connected User') + except Exception as e: current_app.logger.warning(f"Could not get user info: {e}") user_email = 'Connected User' diff --git a/application/core/settings.py b/application/core/settings.py index 4475c443..674b2f9c 100644 --- a/application/core/settings.py +++ b/application/core/settings.py @@ -51,6 +51,13 @@ class Settings(BaseSettings): "http://127.0.0.1:7091/api/connectors/callback" ##add redirect url as it is to your provider's console(gcp) ) + # Microsoft Entra ID (Azure AD) integration + MICROSOFT_CLIENT_ID: Optional[str] = None # Azure AD Application (client) ID + MICROSOFT_CLIENT_SECRET: Optional[str] = None # Azure AD Application client secret + MICROSOFT_TENANT_ID: Optional[str] = "common" # Azure AD Tenant ID (or 'common' for multi-tenant) + MICROSOFT_REDIRECT_URI: Optional[str] = "http://localhost:7091/api/connectors/callback" # Your project's redirect URI that you registered in Azure Portal. 
class SharePointAuth(BaseConnectorAuth):
    """
    Handles Microsoft OAuth 2.0 authentication (authorization-code flow) via MSAL.

    Client ID, client secret, redirect URI, tenant ID and authority are read
    from ``settings`` when the object is constructed.

    # Documentation:
    - https://learn.microsoft.com/en-us/entra/identity-platform/v2-oauth2-auth-code-flow
    - https://learn.microsoft.com/en-gb/entra/msal/python/
    """

    # Microsoft Graph scopes requested during sign-in
    SCOPES = [
        "User.Read",
    ]

    def __init__(self):
        """
        Build the MSAL confidential client from application settings.

        Raises:
            ValueError: If MICROSOFT_CLIENT_ID or MICROSOFT_CLIENT_SECRET is not set.
        """
        self.client_id = settings.MICROSOFT_CLIENT_ID
        self.client_secret = settings.MICROSOFT_CLIENT_SECRET

        if not self.client_id or not self.client_secret:
            raise ValueError(
                "Microsoft OAuth credentials not configured. Please set MICROSOFT_CLIENT_ID and MICROSOFT_CLIENT_SECRET in settings."
            )

        self.redirect_uri = settings.MICROSOFT_REDIRECT_URI
        self.tenant_id = settings.MICROSOFT_TENANT_ID

        # The settings class declares MICROSOFT_AUTHORITY with a default of None,
        # so a getattr() default is never used: the attribute always exists.
        # Fall back explicitly when the value is unset/empty. The fallback follows
        # the documented "https://login.microsoftonline.com/{tenant_id}" form,
        # which is valid for the default tenant "common"; deployments on other
        # authority hosts (e.g. *.ciamlogin.com) must set MICROSOFT_AUTHORITY.
        configured_authority = getattr(settings, "MICROSOFT_AUTHORITY", None)
        self.authority = configured_authority or f"https://login.microsoftonline.com/{self.tenant_id}"

        self.auth_app = ConfidentialClientApplication(
            client_id=self.client_id, client_credential=self.client_secret, authority=self.authority
        )

    def get_authorization_url(self, state: Optional[str] = None) -> str:
        """
        Return the URL the user must visit to grant consent.

        Args:
            state: Opaque value passed through to the authorization request.
        """
        return self.auth_app.get_authorization_request_url(
            scopes=self.SCOPES, state=state, redirect_uri=self.redirect_uri
        )

    def exchange_code_for_tokens(self, authorization_code: str) -> Dict[str, Any]:
        """
        Redeem an authorization code and return the mapped token info.

        Raises:
            ValueError: If the token endpoint reports an error.
        """
        result = self.auth_app.acquire_token_by_authorization_code(
            code=authorization_code, scopes=self.SCOPES, redirect_uri=self.redirect_uri
        )
        self._raise_on_error(result)
        return self.map_token_response(result)

    def refresh_access_token(self, refresh_token: str) -> Dict[str, Any]:
        """
        Obtain fresh tokens from a refresh token and return the mapped token info.

        Raises:
            ValueError: If the token endpoint reports an error.
        """
        result = self.auth_app.acquire_token_by_refresh_token(refresh_token=refresh_token, scopes=self.SCOPES)
        self._raise_on_error(result)
        return self.map_token_response(result)

    def is_token_expired(self, token_info: Dict[str, Any]) -> bool:
        """
        Return True when ``token_info`` has no usable expiry or the expiry has passed.

        A missing, ``None`` or non-numeric ``expiry`` is treated as expired so the
        caller refreshes instead of failing on a comparison with ``None``.
        """
        raw_expiry = token_info.get("expiry") if token_info else None
        try:
            expiry_timestamp = float(raw_expiry)
        except (TypeError, ValueError):
            # No expiry information: consider the token expired to be safe.
            return True

        current_timestamp = int(datetime.datetime.now().timestamp())

        # Token is expired if current time is greater than or equal to expiry time
        return current_timestamp >= expiry_timestamp

    def map_token_response(self, result: Dict[str, Any]) -> Dict[str, Any]:
        """
        Convert an MSAL token result into the token-info dict stored for the session.

        ``expiry`` is a POSIX timestamp. It is derived from the access token's
        ``expires_in`` when the response carries one, and otherwise falls back to
        the ID token's ``exp`` claim (which may be absent, giving ``None``).
        """
        # "id_token_claims" can be missing or None; normalise so lookups never fail.
        claims = result.get("id_token_claims") or {}

        try:
            expiry = int(datetime.datetime.now().timestamp()) + int(result.get("expires_in"))
        except (TypeError, ValueError):
            expiry = claims.get("exp")

        return {
            "access_token": result.get("access_token"),
            "refresh_token": result.get("refresh_token"),
            "token_uri": claims.get("iss"),
            "scopes": result.get("scope"),
            "expiry": expiry,
            "user_info": {
                "name": claims.get("name"),
                "email": claims.get("preferred_username"),
            },
            "raw_token": result,
        }

    @staticmethod
    def _raise_on_error(result: Dict[str, Any]) -> None:
        """Log and raise ValueError when an MSAL result contains an ``error`` key."""
        if "error" in result:
            # Fall back to the error code when no description was returned.
            detail = result.get("error_description") or result.get("error")
            logging.error(f"Error acquiring token: {detail}")
            raise ValueError(f"Error acquiring token: {detail}")
def download_to_directory(self, local_dir: str, source_config: Dict[str, Any] = None) -> Dict[str, Any]:
    """
    Download files/folders to a local directory.

    Placeholder: the body performs no work yet and returns ``None``.

    Args:
        local_dir: Local directory path to download files to
        source_config: Configuration for what to download

    Returns:
        Intended to be a dictionary of download results with the keys
        ``files_downloaded``, ``directory_path``, ``empty_result``,
        ``source_type``, ``config_used`` and, on failure, ``error``.
    """
    return None