diff --git a/.env-template b/.env-template
index 8b53112c..13575fc3 100644
--- a/.env-template
+++ b/.env-template
@@ -12,4 +12,17 @@ EMBEDDINGS_KEY=
OPENAI_API_BASE=
OPENAI_API_VERSION=
AZURE_DEPLOYMENT_NAME=
-AZURE_EMBEDDINGS_DEPLOYMENT_NAME=
\ No newline at end of file
+AZURE_EMBEDDINGS_DEPLOYMENT_NAME=
+
+# Azure AD Application (client) ID
+MICROSOFT_CLIENT_ID=your-azure-ad-client-id
+# Azure AD Application client secret
+MICROSOFT_CLIENT_SECRET=your-azure-ad-client-secret
+# Azure AD Tenant ID (or 'common' for multi-tenant)
+MICROSOFT_TENANT_ID=your-azure-ad-tenant-id
+# For a standard Microsoft Entra ID tenant, set MICROSOFT_AUTHORITY to
+# "https://login.microsoftonline.com/TENANT_GUID" or
+# "https://login.microsoftonline.com/contoso.onmicrosoft.com"; use
+# "https://login.microsoftonline.com/common" for a multi-tenant app. The
+# value below shows the format for an external (CIAM) tenant.
+MICROSOFT_AUTHORITY=https://{tenantId}.ciamlogin.com/{tenantId}
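
For reference, a minimal sketch (not part of the diff) of how these variables resolve to an MSAL authority; the fallback rule mirrors the settings and auth code below, and all values are placeholders:

```python
# Minimal sketch: resolving the Microsoft authority from environment values.
import os

tenant_id = os.environ.get("MICROSOFT_TENANT_ID", "common")

# Prefer an explicit MICROSOFT_AUTHORITY; otherwise fall back to the standard
# Entra ID endpoint for the configured tenant.
authority = os.environ.get("MICROSOFT_AUTHORITY") or f"https://login.microsoftonline.com/{tenant_id}"
print(authority)
```
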
diff --git a/application/agents/react_agent.py b/application/agents/react_agent.py
index 116fa4aa..92be75f6 100644
--- a/application/agents/react_agent.py
+++ b/application/agents/react_agent.py
@@ -235,4 +235,4 @@ class ReActAgent(BaseAgent):
)
except Exception as e:
logger.error(f"Error extracting content: {e}")
- return "".join(collected)
+ return "".join(collected)
\ No newline at end of file
diff --git a/application/api/connector/routes.py b/application/api/connector/routes.py
index 91fc3f0b..913e5349 100644
--- a/application/api/connector/routes.py
+++ b/application/api/connector/routes.py
@@ -146,20 +146,19 @@ class ConnectorsCallback(Resource):
session_token = str(uuid.uuid4())
try:
- credentials = auth.create_credentials_from_token_info(token_info)
- service = auth.build_drive_service(credentials)
- user_info = service.about().get(fields="user").execute()
- user_email = user_info.get('user', {}).get('emailAddress', 'Connected User')
+ if provider == "google_drive":
+ credentials = auth.create_credentials_from_token_info(token_info)
+ service = auth.build_drive_service(credentials)
+ user_info = service.about().get(fields="user").execute()
+ user_email = user_info.get('user', {}).get('emailAddress', 'Connected User')
+ else:
+ user_email = token_info.get('user_info', {}).get('email', 'Connected User')
+
except Exception as e:
current_app.logger.warning(f"Could not get user info: {e}")
user_email = 'Connected User'
- sanitized_token_info = {
- "access_token": token_info.get("access_token"),
- "refresh_token": token_info.get("refresh_token"),
- "token_uri": token_info.get("token_uri"),
- "expiry": token_info.get("expiry")
- }
+ sanitized_token_info = auth.sanitize_token_info(token_info)
sessions_collection.find_one_and_update(
{"_id": ObjectId(state_object_id), "provider": provider},
@@ -201,12 +200,12 @@ class ConnectorsCallback(Resource):
@connectors_ns.route("/api/connectors/files")
class ConnectorFiles(Resource):
@api.expect(api.model("ConnectorFilesModel", {
- "provider": fields.String(required=True),
- "session_token": fields.String(required=True),
- "folder_id": fields.String(required=False),
- "limit": fields.Integer(required=False),
+ "provider": fields.String(required=True),
+ "session_token": fields.String(required=True),
+ "folder_id": fields.String(required=False),
+ "limit": fields.Integer(required=False),
"page_token": fields.String(required=False),
- "search_query": fields.String(required=False)
+ "search_query": fields.String(required=False),
}))
@api.doc(description="List files from a connector provider (supports pagination and search)")
def post(self):
@@ -214,11 +213,8 @@ class ConnectorFiles(Resource):
data = request.get_json()
provider = data.get('provider')
session_token = data.get('session_token')
- folder_id = data.get('folder_id')
limit = data.get('limit', 10)
- page_token = data.get('page_token')
- search_query = data.get('search_query')
-
+
if not provider or not session_token:
return make_response(jsonify({"success": False, "error": "provider and session_token are required"}), 400)
@@ -231,15 +227,12 @@ class ConnectorFiles(Resource):
return make_response(jsonify({"success": False, "error": "Invalid or unauthorized session"}), 401)
loader = ConnectorCreator.create_connector(provider, session_token)
+
+ generic_keys = {'provider', 'session_token'}
input_config = {
- 'limit': limit,
- 'list_only': True,
- 'session_token': session_token,
- 'folder_id': folder_id,
- 'page_token': page_token
+ k: v for k, v in data.items() if k not in generic_keys
}
- if search_query:
- input_config['search_query'] = search_query
+ input_config['list_only'] = True
documents = loader.load_data(input_config)
@@ -306,12 +299,7 @@ class ConnectorValidateSession(Resource):
if is_expired and token_info.get('refresh_token'):
try:
refreshed_token_info = auth.refresh_access_token(token_info.get('refresh_token'))
- sanitized_token_info = {
- "access_token": refreshed_token_info.get("access_token"),
- "refresh_token": refreshed_token_info.get("refresh_token"),
- "token_uri": refreshed_token_info.get("token_uri"),
- "expiry": refreshed_token_info.get("expiry")
- }
+ sanitized_token_info = auth.sanitize_token_info(refreshed_token_info)
sessions_collection.update_one(
{"session_token": session_token},
{"$set": {"token_info": sanitized_token_info}}
@@ -328,12 +316,18 @@ class ConnectorValidateSession(Resource):
"error": "Session token has expired. Please reconnect."
}), 401)
- return make_response(jsonify({
+ _base_fields = {"access_token", "refresh_token", "token_uri", "expiry"}
+ provider_extras = {k: v for k, v in token_info.items() if k not in _base_fields}
+
+ response_data = {
"success": True,
"expired": False,
"user_email": session.get('user_email', 'Connected User'),
- "access_token": token_info.get('access_token')
- }), 200)
+ "access_token": token_info.get('access_token'),
+ **provider_extras,
+ }
+
+ return make_response(jsonify(response_data), 200)
except Exception as e:
current_app.logger.error(f"Error validating connector session: {e}", exc_info=True)
return make_response(jsonify({"success": False, "error": "Failed to validate session"}), 500)
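
The filtering change above makes the files route provider-agnostic. A small sketch of the passthrough pattern, with a hypothetical payload:

```python
# Sketch: everything except the generic transport keys is forwarded to the
# loader, so provider-specific options (e.g. SharePoint's 'shared' flag)
# require no route changes.
def build_input_config(data: dict) -> dict:
    generic_keys = {"provider", "session_token"}
    config = {k: v for k, v in data.items() if k not in generic_keys}
    config["list_only"] = True  # this route lists metadata, never content
    return config

payload = {"provider": "share_point", "session_token": "abc", "shared": True, "limit": 25}
assert build_input_config(payload) == {"shared": True, "limit": 25, "list_only": True}
```
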
diff --git a/application/api/user/agents/routes.py b/application/api/user/agents/routes.py
index 64fa7bed..8f313a7e 100644
--- a/application/api/user/agents/routes.py
+++ b/application/api/user/agents/routes.py
@@ -1412,4 +1412,4 @@ class RemoveSharedAgent(Resource):
current_app.logger.error(f"Error removing shared agent: {err}")
return make_response(
jsonify({"success": False, "message": "Server error"}), 500
- )
+ )
\ No newline at end of file
diff --git a/application/core/settings.py b/application/core/settings.py
index 5c424074..5cdf7f09 100644
--- a/application/core/settings.py
+++ b/application/core/settings.py
@@ -65,8 +65,14 @@ class Settings(BaseSettings):
"http://127.0.0.1:7091/api/connectors/callback" ##add redirect url as it is to your provider's console(gcp)
)
+ # Microsoft Entra ID (Azure AD) integration
+ MICROSOFT_CLIENT_ID: Optional[str] = None # Azure AD Application (client) ID
+ MICROSOFT_CLIENT_SECRET: Optional[str] = None # Azure AD Application client secret
+ MICROSOFT_TENANT_ID: Optional[str] = "common" # Azure AD Tenant ID (or 'common' for multi-tenant)
+ MICROSOFT_AUTHORITY: Optional[str] = None # e.g., "https://login.microsoftonline.com/{tenant_id}"
+
# GitHub source
- GITHUB_ACCESS_TOKEN: Optional[str] = None # PAT token with read repo access
+ GITHUB_ACCESS_TOKEN: Optional[str] = None # PAT token with read repo access
# LLM Cache
CACHE_REDIS_URL: str = "redis://localhost:6379/2"
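
Because MICROSOFT_AUTHORITY defaults to None, a getattr() default never fires; consumers need an `or` fallback instead. A minimal sketch with a stand-in settings object:

```python
from typing import Optional

class FakeSettings:  # stand-in for application.core.settings.settings
    MICROSOFT_TENANT_ID: Optional[str] = "common"
    MICROSOFT_AUTHORITY: Optional[str] = None

settings = FakeSettings()

# The attribute exists (as None), so getattr's default is ignored.
wrong = getattr(settings, "MICROSOFT_AUTHORITY", "https://login.microsoftonline.com/common")
assert wrong is None

# `or` falls back when the configured value is None or empty.
right = settings.MICROSOFT_AUTHORITY or f"https://login.microsoftonline.com/{settings.MICROSOFT_TENANT_ID}"
assert right == "https://login.microsoftonline.com/common"
```
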
diff --git a/application/parser/connectors/base.py b/application/parser/connectors/base.py
index dfb6de87..b9b7f78f 100644
--- a/application/parser/connectors/base.py
+++ b/application/parser/connectors/base.py
@@ -62,15 +62,26 @@ class BaseConnectorAuth(ABC):
def is_token_expired(self, token_info: Dict[str, Any]) -> bool:
"""
Check if a token is expired.
-
+
Args:
token_info: Token information dictionary
-
+
Returns:
True if token is expired, False otherwise
"""
pass
+
+    def sanitize_token_info(self, token_info: Dict[str, Any], **extra_fields) -> Dict[str, Any]:
+        """Extract the fields safe to persist in the session store."""
+ return {
+ "access_token": token_info.get("access_token"),
+ "refresh_token": token_info.get("refresh_token"),
+ "token_uri": token_info.get("token_uri"),
+ "expiry": token_info.get("expiry"),
+ **extra_fields,
+ }
+
class BaseConnectorLoader(ABC):
"""
diff --git a/application/parser/connectors/connector_creator.py b/application/parser/connectors/connector_creator.py
index bf4456ca..609e6407 100644
--- a/application/parser/connectors/connector_creator.py
+++ b/application/parser/connectors/connector_creator.py
@@ -1,5 +1,7 @@
from application.parser.connectors.google_drive.loader import GoogleDriveLoader
from application.parser.connectors.google_drive.auth import GoogleDriveAuth
+from application.parser.connectors.share_point.auth import SharePointAuth
+from application.parser.connectors.share_point.loader import SharePointLoader
class ConnectorCreator:
@@ -12,10 +14,12 @@ class ConnectorCreator:
connectors = {
"google_drive": GoogleDriveLoader,
+ "share_point": SharePointLoader,
}
auth_providers = {
"google_drive": GoogleDriveAuth,
+ "share_point": SharePointAuth,
}
@classmethod
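
A hypothetical call site for the registry; the session token would come from the connector OAuth callback:

```python
from application.parser.connectors.connector_creator import ConnectorCreator

# "session-token-from-callback" is a placeholder, not a real token.
loader = ConnectorCreator.create_connector("share_point", "session-token-from-callback")
documents = loader.load_data({"list_only": True, "limit": 10})
for doc in documents:
    print(doc.extra_info.get("file_name"))
```
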
diff --git a/application/parser/connectors/google_drive/auth.py b/application/parser/connectors/google_drive/auth.py
index f5fbe056..e24368c9 100644
--- a/application/parser/connectors/google_drive/auth.py
+++ b/application/parser/connectors/google_drive/auth.py
@@ -232,10 +232,6 @@ class GoogleDriveAuth(BaseConnectorAuth):
if missing_fields:
raise ValueError(f"Missing required token fields: {missing_fields}")
- if 'client_id' not in token_info:
- token_info['client_id'] = settings.GOOGLE_CLIENT_ID
- if 'client_secret' not in token_info:
- token_info['client_secret'] = settings.GOOGLE_CLIENT_SECRET
if 'token_uri' not in token_info:
token_info['token_uri'] = 'https://oauth2.googleapis.com/token'
diff --git a/application/parser/connectors/google_drive/loader.py b/application/parser/connectors/google_drive/loader.py
index c96a08be..9605517c 100644
--- a/application/parser/connectors/google_drive/loader.py
+++ b/application/parser/connectors/google_drive/loader.py
@@ -327,15 +327,10 @@ class GoogleDriveLoader(BaseConnectorLoader):
content_bytes = file_io.getvalue()
try:
- content = content_bytes.decode('utf-8')
+ return content_bytes.decode('utf-8')
except UnicodeDecodeError:
- try:
- content = content_bytes.decode('latin-1')
- except UnicodeDecodeError:
- logging.error(f"Could not decode file {file_id} as text")
- return None
-
- return content
+ logging.error(f"Could not decode file {file_id} as text")
+ return None
except HttpError as e:
logging.error(f"HTTP error downloading file {file_id}: {e.resp.status} - {e.content}")
diff --git a/application/parser/connectors/share_point/__init__.py b/application/parser/connectors/share_point/__init__.py
new file mode 100644
index 00000000..b83bb56f
--- /dev/null
+++ b/application/parser/connectors/share_point/__init__.py
@@ -0,0 +1,10 @@
+"""
+SharePoint connector package for DocsGPT.
+
+This package provides authentication and document loading capabilities for SharePoint.
+"""
+
+from .auth import SharePointAuth
+from .loader import SharePointLoader
+
+__all__ = ['SharePointAuth', 'SharePointLoader']
\ No newline at end of file
diff --git a/application/parser/connectors/share_point/auth.py b/application/parser/connectors/share_point/auth.py
new file mode 100644
index 00000000..1da894b1
--- /dev/null
+++ b/application/parser/connectors/share_point/auth.py
@@ -0,0 +1,152 @@
+import datetime
+import logging
+from typing import Optional, Dict, Any
+
+from msal import ConfidentialClientApplication
+
+from application.core.settings import settings
+from application.parser.connectors.base import BaseConnectorAuth
+
+logger = logging.getLogger(__name__)
+
+
+class SharePointAuth(BaseConnectorAuth):
+ """
+ Handles Microsoft OAuth 2.0 authentication for SharePoint/OneDrive.
+
+    Note: the Files.Read scope grants read-only access to the signed-in user's
+    own files; Sites.Read.All covers SharePoint site content the user can reach.
+ """
+
+ SCOPES = [
+ "Files.Read",
+ "Sites.Read.All",
+ "User.Read",
+ ]
+
+ def __init__(self):
+ self.client_id = settings.MICROSOFT_CLIENT_ID
+ self.client_secret = settings.MICROSOFT_CLIENT_SECRET
+
+ if not self.client_id:
+ raise ValueError(
+ "Microsoft OAuth credentials not configured. Please set MICROSOFT_CLIENT_ID in settings."
+ )
+
+ if not self.client_secret:
+ raise ValueError(
+ "Microsoft OAuth credentials not configured. Please set MICROSOFT_CLIENT_SECRET in settings."
+ )
+
+ self.redirect_uri = settings.CONNECTOR_REDIRECT_BASE_URI
+ self.tenant_id = settings.MICROSOFT_TENANT_ID
+        self.authority = settings.MICROSOFT_AUTHORITY or f"https://login.microsoftonline.com/{self.tenant_id}"
+
+ self.auth_app = ConfidentialClientApplication(
+ client_id=self.client_id,
+ client_credential=self.client_secret,
+ authority=self.authority
+ )
+
+ def get_authorization_url(self, state: Optional[str] = None) -> str:
+ return self.auth_app.get_authorization_request_url(
+ scopes=self.SCOPES, state=state, redirect_uri=self.redirect_uri
+ )
+
+ def exchange_code_for_tokens(self, authorization_code: str) -> Dict[str, Any]:
+ result = self.auth_app.acquire_token_by_authorization_code(
+ code=authorization_code,
+ scopes=self.SCOPES,
+ redirect_uri=self.redirect_uri
+ )
+
+ if "error" in result:
+ logger.error("Token exchange failed: %s", result.get("error_description"))
+ raise ValueError(f"Error acquiring token: {result.get('error_description')}")
+
+ return self.map_token_response(result)
+
+ def refresh_access_token(self, refresh_token: str) -> Dict[str, Any]:
+ result = self.auth_app.acquire_token_by_refresh_token(refresh_token=refresh_token, scopes=self.SCOPES)
+
+ if "error" in result:
+ logger.error("Token refresh failed: %s", result.get("error_description"))
+ raise ValueError(f"Error refreshing token: {result.get('error_description')}")
+
+ return self.map_token_response(result)
+
+ def get_token_info_from_session(self, session_token: str) -> Dict[str, Any]:
+ try:
+            from application.core.mongo_db import MongoDB
+
+ mongo = MongoDB.get_client()
+ db = mongo[settings.MONGO_DB_NAME]
+
+ sessions_collection = db["connector_sessions"]
+ session = sessions_collection.find_one({"session_token": session_token})
+
+ if not session:
+ raise ValueError(f"Invalid session token: {session_token}")
+
+ if "token_info" not in session:
+ raise ValueError("Session missing token information")
+
+ token_info = session["token_info"]
+ if not token_info:
+ raise ValueError("Invalid token information")
+
+ required_fields = ["access_token", "refresh_token"]
+ missing_fields = [field for field in required_fields if field not in token_info or not token_info.get(field)]
+ if missing_fields:
+ raise ValueError(f"Missing required token fields: {missing_fields}")
+
+ if 'token_uri' not in token_info:
+ token_info['token_uri'] = f"https://login.microsoftonline.com/{settings.MICROSOFT_TENANT_ID}/oauth2/v2.0/token"
+
+ return token_info
+
+ except Exception as e:
+ logger.error("Failed to retrieve token from session: %s", e)
+ raise ValueError(f"Failed to retrieve SharePoint token information: {str(e)}")
+
+ def is_token_expired(self, token_info: Dict[str, Any]) -> bool:
+ if not token_info:
+ return True
+
+ expiry_timestamp = token_info.get("expiry")
+
+ if expiry_timestamp is None:
+ return True
+
+        current_timestamp = int(datetime.datetime.now().timestamp())
+        # Treat tokens within 60 seconds of expiry as expired to absorb clock skew.
+        return (expiry_timestamp - current_timestamp) < 60
+
+ def sanitize_token_info(self, token_info: Dict[str, Any], **extra_fields) -> Dict[str, Any]:
+ return super().sanitize_token_info(
+ token_info,
+ allows_shared_content=token_info.get("allows_shared_content", False),
+ **extra_fields,
+ )
+
+    # Well-known tenant ID used by personal Microsoft accounts (MSA).
+    PERSONAL_ACCOUNT_TENANT_ID = "9188040d-6c67-4c5b-b112-36a304b66dad"
+
+ def _allows_shared_content(self, id_token_claims: Dict[str, Any]) -> bool:
+ """Return True when the account is a work/school tenant that can access SharePoint shared content."""
+ tid = id_token_claims.get("tid", "")
+ return bool(tid) and tid != self.PERSONAL_ACCOUNT_TENANT_ID
+
+    def map_token_response(self, result: Dict[str, Any]) -> Dict[str, Any]:
+        claims = result.get("id_token_claims", {})
+        return {
+            "access_token": result.get("access_token"),
+            "refresh_token": result.get("refresh_token"),
+            # The ID token issuer stands in for a token URI; MSAL resolves the
+            # actual token endpoint from the configured authority.
+            "token_uri": claims.get("iss"),
+            "scopes": result.get("scope"),
+            # The ID token's exp claim approximates the access-token lifetime,
+            # since both tokens are issued together.
+            "expiry": claims.get("exp"),
+ "allows_shared_content": self._allows_shared_content(claims),
+ "user_info": {
+ "name": claims.get("name"),
+ "email": claims.get("preferred_username"),
+ },
+ }
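
A sketch of the intended OAuth flow through this class, assuming the MICROSOFT_* settings are configured; the authorization code is a placeholder:

```python
from application.parser.connectors.share_point.auth import SharePointAuth

auth = SharePointAuth()

# 1. Redirect the user to Microsoft's consent page.
url = auth.get_authorization_url(state="opaque-state-value")

# 2. In the callback, exchange the returned code for tokens.
token_info = auth.exchange_code_for_tokens("code-from-callback")  # placeholder code

# 3. Persist only the safe fields plus the shared-content capability flag.
safe = auth.sanitize_token_info(token_info)

# 4. Later, refresh once less than 60 seconds of lifetime remain.
if auth.is_token_expired(safe):
    safe = auth.sanitize_token_info(auth.refresh_access_token(safe["refresh_token"]))
```
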
diff --git a/application/parser/connectors/share_point/loader.py b/application/parser/connectors/share_point/loader.py
new file mode 100644
index 00000000..191e3e54
--- /dev/null
+++ b/application/parser/connectors/share_point/loader.py
@@ -0,0 +1,649 @@
+"""
+SharePoint/OneDrive loader for DocsGPT.
+Loads documents from SharePoint/OneDrive using Microsoft Graph API.
+"""
+
+import functools
+import logging
+import os
+from typing import List, Dict, Any, Optional, Tuple
+from urllib.parse import quote
+
+import requests
+
+from application.parser.connectors.base import BaseConnectorLoader
+from application.parser.connectors.share_point.auth import SharePointAuth
+from application.parser.schema.base import Document
+
+
+def _retry_on_auth_failure(func):
+ """Retry once after refreshing the access token on 401/403 responses."""
+ @functools.wraps(func)
+ def wrapper(self, *args, **kwargs):
+ try:
+ return func(self, *args, **kwargs)
+ except requests.exceptions.HTTPError as e:
+ if e.response is not None and e.response.status_code in (401, 403):
+ logging.info(f"Auth failure in {func.__name__}, refreshing token and retrying")
+ try:
+ new_token_info = self.auth.refresh_access_token(self.refresh_token)
+ self.access_token = new_token_info.get('access_token')
+ except Exception as refresh_error:
+ raise ValueError(
+ f"Authentication failed and could not be refreshed: {refresh_error}"
+ ) from e
+ return func(self, *args, **kwargs)
+ raise
+ return wrapper
+
+
+class SharePointLoader(BaseConnectorLoader):
+
+ SUPPORTED_MIME_TYPES = {
+ 'application/pdf': '.pdf',
+ 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx',
+ 'application/vnd.openxmlformats-officedocument.presentationml.presentation': '.pptx',
+ 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': '.xlsx',
+ 'application/msword': '.doc',
+ 'application/vnd.ms-powerpoint': '.ppt',
+ 'application/vnd.ms-excel': '.xls',
+ 'text/plain': '.txt',
+ 'text/csv': '.csv',
+ 'text/html': '.html',
+ 'text/markdown': '.md',
+ 'text/x-rst': '.rst',
+ 'application/json': '.json',
+ 'application/epub+zip': '.epub',
+ 'application/rtf': '.rtf',
+ 'image/jpeg': '.jpg',
+ 'image/png': '.png',
+ }
+
+ EXTENSION_TO_MIME = {v: k for k, v in SUPPORTED_MIME_TYPES.items()}
+
+ GRAPH_API_BASE = "https://graph.microsoft.com/v1.0"
+
+ def __init__(self, session_token: str):
+ self.auth = SharePointAuth()
+ self.session_token = session_token
+
+ token_info = self.auth.get_token_info_from_session(session_token)
+ self.access_token = token_info.get('access_token')
+ self.refresh_token = token_info.get('refresh_token')
+ self.allows_shared_content = token_info.get('allows_shared_content', False)
+
+ if not self.access_token:
+ raise ValueError("No access token found in session")
+
+ self.next_page_token = None
+
+ def _get_headers(self) -> Dict[str, str]:
+ return {
+ 'Authorization': f'Bearer {self.access_token}',
+ 'Accept': 'application/json'
+ }
+
+ def _ensure_valid_token(self):
+ if not self.access_token:
+ raise ValueError("No access token available")
+
+        # Expiry is not tracked on the loader side, so is_token_expired() sees
+        # expiry=None and the token is refreshed on every call.
+        token_info = {'access_token': self.access_token, 'expiry': None}
+ if self.auth.is_token_expired(token_info):
+ logging.info("Token expired, attempting refresh")
+ try:
+ new_token_info = self.auth.refresh_access_token(self.refresh_token)
+ self.access_token = new_token_info.get('access_token')
+            except Exception as e:
+                raise ValueError("Failed to refresh access token") from e
+
+ def _get_item_url(self, item_ref: str) -> str:
+ if ':' in item_ref:
+ drive_id, item_id = item_ref.split(':', 1)
+ return f"{self.GRAPH_API_BASE}/drives/{drive_id}/items/{item_id}"
+ return f"{self.GRAPH_API_BASE}/me/drive/items/{item_ref}"
+
+ def _process_file(self, file_metadata: Dict[str, Any], load_content: bool = True) -> Optional[Document]:
+ try:
+ drive_item_id = file_metadata.get('id')
+ file_name = file_metadata.get('name', 'Unknown')
+ file_data = file_metadata.get('file', {})
+ mime_type = file_data.get('mimeType', 'application/octet-stream')
+
+ if mime_type not in self.SUPPORTED_MIME_TYPES:
+ logging.info(f"Skipping unsupported file type: {mime_type} for file {file_name}")
+ return None
+
+ doc_metadata = {
+ 'file_name': file_name,
+ 'mime_type': mime_type,
+ 'size': file_metadata.get('size'),
+ 'created_time': file_metadata.get('createdDateTime'),
+ 'modified_time': file_metadata.get('lastModifiedDateTime'),
+ 'source': 'share_point'
+ }
+
+ if not load_content:
+ return Document(
+ text="",
+ doc_id=drive_item_id,
+ extra_info=doc_metadata
+ )
+
+ content = self._download_file_content(drive_item_id)
+ if content is None:
+ logging.warning(f"Could not load content for file {file_name} ({drive_item_id})")
+ return None
+
+ return Document(
+ text=content,
+ doc_id=drive_item_id,
+ extra_info=doc_metadata
+ )
+
+ except Exception as e:
+ logging.error(f"Error processing file: {e}")
+ return None
+
+ def load_data(self, inputs: Dict[str, Any]) -> List[Document]:
+ try:
+ documents: List[Document] = []
+
+ folder_id = inputs.get('folder_id')
+ file_ids = inputs.get('file_ids', [])
+ limit = inputs.get('limit', 100)
+ list_only = inputs.get('list_only', False)
+ load_content = not list_only
+ page_token = inputs.get('page_token')
+ search_query = inputs.get('search_query')
+ self.next_page_token = None
+
+ shared = inputs.get('shared', False)
+
+ if file_ids:
+ for file_id in file_ids:
+ try:
+ doc = self._load_file_by_id(file_id, load_content=load_content)
+ if doc:
+ if not search_query or (
+ search_query.lower() in doc.extra_info.get('file_name', '').lower()
+ ):
+ documents.append(doc)
+ except Exception as e:
+ logging.error(f"Error loading file {file_id}: {e}")
+ continue
+ elif shared:
+ if not self.allows_shared_content:
+ logging.warning("Shared content is only available for work/school Microsoft accounts")
+ return []
+ documents = self._list_shared_items(
+ limit=limit,
+ load_content=load_content,
+ page_token=page_token,
+ search_query=search_query
+ )
+ else:
+ parent_id = folder_id if folder_id else 'root'
+ documents = self._list_items_in_parent(
+ parent_id,
+ limit=limit,
+ load_content=load_content,
+ page_token=page_token,
+ search_query=search_query
+ )
+
+ logging.info(f"Loaded {len(documents)} documents from SharePoint/OneDrive")
+ return documents
+
+ except Exception as e:
+ logging.error(f"Error loading data from SharePoint/OneDrive: {e}", exc_info=True)
+ raise
+
+ @_retry_on_auth_failure
+ def _load_file_by_id(self, file_id: str, load_content: bool = True) -> Optional[Document]:
+ self._ensure_valid_token()
+
+ try:
+ url = self._get_item_url(file_id)
+ params = {'$select': 'id,name,file,createdDateTime,lastModifiedDateTime,size'}
+ response = requests.get(url, headers=self._get_headers(), params=params)
+ response.raise_for_status()
+
+ file_metadata = response.json()
+ return self._process_file(file_metadata, load_content=load_content)
+
+ except requests.exceptions.HTTPError:
+ raise
+ except Exception as e:
+ logging.error(f"Error loading file {file_id}: {e}")
+ return None
+
+ @_retry_on_auth_failure
+ def _list_items_in_parent(self, parent_id: str, limit: int = 100, load_content: bool = False, page_token: Optional[str] = None, search_query: Optional[str] = None) -> List[Document]:
+ self._ensure_valid_token()
+
+ documents: List[Document] = []
+
+ try:
+ url = f"{self._get_item_url(parent_id)}/children"
+ params = {'$top': min(100, limit) if limit else 100, '$select': 'id,name,file,folder,createdDateTime,lastModifiedDateTime,size'}
+            if page_token:
+                # Graph's OData paging option is lowercase: $skiptoken.
+                params['$skiptoken'] = page_token
+
+ if search_query:
+ encoded_query = quote(search_query, safe='')
+ if ':' in parent_id:
+ drive_id = parent_id.split(':', 1)[0]
+ search_url = f"{self.GRAPH_API_BASE}/drives/{drive_id}/root/search(q='{encoded_query}')"
+ else:
+ search_url = f"{self.GRAPH_API_BASE}/me/drive/search(q='{encoded_query}')"
+ response = requests.get(search_url, headers=self._get_headers(), params=params)
+ else:
+ response = requests.get(url, headers=self._get_headers(), params=params)
+
+ response.raise_for_status()
+
+ results = response.json()
+
+ items = results.get('value', [])
+ for item in items:
+ if 'folder' in item:
+ doc_metadata = {
+ 'file_name': item.get('name', 'Unknown'),
+ 'mime_type': 'folder',
+ 'size': item.get('size'),
+ 'created_time': item.get('createdDateTime'),
+ 'modified_time': item.get('lastModifiedDateTime'),
+ 'source': 'share_point',
+ 'is_folder': True
+ }
+ documents.append(Document(text="", doc_id=item.get('id'), extra_info=doc_metadata))
+ else:
+ doc = self._process_file(item, load_content=load_content)
+ if doc:
+ documents.append(doc)
+
+ if limit and len(documents) >= limit:
+ break
+
+ next_link = results.get('@odata.nextLink')
+ if next_link:
+ from urllib.parse import urlparse, parse_qs
+ parsed = urlparse(next_link)
+ query_params = parse_qs(parsed.query)
+ skiptoken_list = query_params.get('$skiptoken')
+ if skiptoken_list:
+ self.next_page_token = skiptoken_list[0]
+ else:
+ self.next_page_token = None
+ else:
+ self.next_page_token = None
+ return documents
+
+ except Exception as e:
+ logging.error(f"Error listing items under parent {parent_id}: {e}")
+ return documents
+
+ def _resolve_mime_type(self, resource: Dict[str, Any]) -> Tuple[str, bool]:
+ """Resolve mime type from resource, falling back to file extension."""
+ file_data = resource.get('file', {})
+ mime_type = file_data.get('mimeType') if file_data else None
+
+ if mime_type and mime_type in self.SUPPORTED_MIME_TYPES:
+ return mime_type, True
+
+ name = resource.get('name', '')
+ ext = os.path.splitext(name)[1].lower()
+ if ext in self.EXTENSION_TO_MIME:
+ return self.EXTENSION_TO_MIME[ext], True
+
+ return mime_type or 'application/octet-stream', False
+
+ def _get_user_drive_web_url(self) -> Optional[str]:
+ """Fetch the current user's OneDrive web URL for KQL path exclusion."""
+ try:
+ response = requests.get(
+ f"{self.GRAPH_API_BASE}/me/drive",
+ headers=self._get_headers(),
+ params={'$select': 'webUrl'}
+ )
+ response.raise_for_status()
+ return response.json().get('webUrl')
+ except Exception as e:
+ logging.warning(f"Could not fetch user drive web URL: {e}")
+ return None
+
+ def _build_shared_kql_query(self, search_query: Optional[str], user_drive_url: Optional[str]) -> str:
+ """Build KQL query string that excludes the user's own drive items."""
+ base_query = search_query if search_query else "*"
+ if user_drive_url:
+ return f'{base_query} AND -path:"{user_drive_url}"'
+ return base_query
+
+ def _list_shared_items(self, limit: int = 100, load_content: bool = False, page_token: Optional[str] = None, search_query: Optional[str] = None) -> List[Document]:
+ """Fetch shared drive items using Microsoft Graph Search API with local offset paging.
+
+ We always fetch up to a fixed maximum number of hits from Graph (single request),
+ then page through that array locally using `page_token` as a simple integer offset.
+ This avoids relying on buggy or inconsistent remote `from`/`size` semantics.
+ """
+ self._ensure_valid_token()
+ documents: List[Document] = []
+
+ try:
+ user_drive_url = self._get_user_drive_web_url()
+ query_text = self._build_shared_kql_query(search_query, user_drive_url)
+
+ url = f"{self.GRAPH_API_BASE}/search/query"
+ page_size = 500 # maximum number of hits we care about for selection
+
+ body = {
+ "requests": [
+ {
+ "entityTypes": ["driveItem"],
+ "query": {"queryString": query_text},
+ "from": 0,
+ "size": page_size,
+ }
+ ]
+ }
+
+ headers = self._get_headers()
+ headers["Content-Type"] = "application/json"
+ response = requests.post(url, headers=headers, json=body)
+ response.raise_for_status()
+ results = response.json()
+
+ search_response = results.get("value", [])
+ if not search_response:
+ logging.warning("Search API returned empty value array")
+ self.next_page_token = None
+ return documents
+
+ hits_containers = search_response[0].get("hitsContainers", [])
+ if not hits_containers:
+ logging.warning("Search API returned no hitsContainers")
+ self.next_page_token = None
+ return documents
+
+ container = hits_containers[0]
+ total = container.get("total", 0)
+ raw_hits = container.get("hits", [])
+
+ # Deduplicate by effective item ID (driveId:itemId) to avoid the same
+ # resource appearing multiple times across the result set.
+ deduped_hits = []
+ seen_ids = set()
+ for hit in raw_hits:
+ resource = hit.get("resource", {})
+ item_id = resource.get("id")
+ drive_id = resource.get("parentReference", {}).get("driveId")
+ effective_id = f"{drive_id}:{item_id}" if drive_id and item_id else item_id
+ if not effective_id or effective_id in seen_ids:
+ continue
+ seen_ids.add(effective_id)
+ deduped_hits.append(hit)
+
+ hits = deduped_hits
+ logging.info(
+ f"Search API returned {total} total results, {len(raw_hits)} raw hits, {len(hits)} unique hits in this batch"
+ )
+ try:
+ offset = int(page_token) if page_token is not None else 0
+ except (TypeError, ValueError):
+ logging.warning(
+ f"Invalid page_token '{page_token}' for shared items search, defaulting to 0"
+ )
+ offset = 0
+
+ if offset < 0:
+ offset = 0
+ if offset >= len(hits):
+ self.next_page_token = None
+ return documents
+
+ end_index = offset + limit if limit else len(hits)
+ end_index = min(end_index, len(hits))
+
+ for hit in hits[offset:end_index]:
+ resource = hit.get("resource", {})
+ item_name = resource.get("name", "Unknown")
+ item_id = resource.get("id")
+ drive_id = resource.get("parentReference", {}).get("driveId")
+
+ effective_id = f"{drive_id}:{item_id}" if drive_id and item_id else item_id
+
+ is_folder = "folder" in resource
+
+ if is_folder:
+ doc_metadata = {
+ "file_name": item_name,
+ "mime_type": "folder",
+ "size": resource.get("size"),
+ "created_time": resource.get("createdDateTime"),
+ "modified_time": resource.get("lastModifiedDateTime"),
+ "source": "share_point",
+ "is_folder": True,
+ }
+ documents.append(
+ Document(text="", doc_id=effective_id, extra_info=doc_metadata)
+ )
+ else:
+ mime_type, supported = self._resolve_mime_type(resource)
+ if not supported:
+ logging.info(
+ f"Skipping unsupported shared file: {item_name} (mime: {mime_type})"
+ )
+ continue
+
+ doc_metadata = {
+ "file_name": item_name,
+ "mime_type": mime_type,
+ "size": resource.get("size"),
+ "created_time": resource.get("createdDateTime"),
+ "modified_time": resource.get("lastModifiedDateTime"),
+ "source": "share_point",
+ }
+
+ content = ""
+ if load_content:
+ content = self._download_file_content(effective_id) or ""
+
+ documents.append(
+ Document(text=content, doc_id=effective_id, extra_info=doc_metadata)
+ )
+
+ if limit and end_index < len(hits):
+ self.next_page_token = str(end_index)
+ else:
+ self.next_page_token = None
+
+ return documents
+
+ except Exception as e:
+ logging.error(f"Error listing shared items via search API: {e}", exc_info=True)
+ return documents
+
+ @_retry_on_auth_failure
+ def _download_file_content(self, file_id: str) -> Optional[str]:
+ self._ensure_valid_token()
+
+ try:
+ url = f"{self._get_item_url(file_id)}/content"
+ response = requests.get(url, headers=self._get_headers())
+ response.raise_for_status()
+
+ try:
+ return response.content.decode('utf-8')
+ except UnicodeDecodeError:
+ logging.error(f"Could not decode file {file_id} as text")
+ return None
+
+ except requests.exceptions.HTTPError:
+ raise
+ except Exception as e:
+ logging.error(f"Error downloading file {file_id}: {e}")
+ return None
+
+ def _download_single_file(self, file_id: str, local_dir: str) -> bool:
+ try:
+ url = self._get_item_url(file_id)
+ params = {'$select': 'id,name,file'}
+ response = requests.get(url, headers=self._get_headers(), params=params)
+ response.raise_for_status()
+
+ metadata = response.json()
+ file_name = metadata.get('name', 'unknown')
+ file_data = metadata.get('file', {})
+ mime_type = file_data.get('mimeType', 'application/octet-stream')
+
+ if mime_type not in self.SUPPORTED_MIME_TYPES:
+ logging.info(f"Skipping unsupported file type: {mime_type}")
+ return False
+
+ os.makedirs(local_dir, exist_ok=True)
+ full_path = os.path.join(local_dir, file_name)
+
+ download_url = f"{self._get_item_url(file_id)}/content"
+ download_response = requests.get(download_url, headers=self._get_headers())
+ download_response.raise_for_status()
+
+ with open(full_path, 'wb') as f:
+ f.write(download_response.content)
+
+ return True
+ except Exception as e:
+ logging.error(f"Error in _download_single_file: {e}")
+ return False
+
+ def _download_folder_recursive(self, folder_id: str, local_dir: str, recursive: bool = True) -> int:
+ files_downloaded = 0
+ try:
+ os.makedirs(local_dir, exist_ok=True)
+
+ url = f"{self._get_item_url(folder_id)}/children"
+ params = {'$top': 1000}
+
+ while url:
+                response = requests.get(url, headers=self._get_headers(), params=params)
+                params = None  # @odata.nextLink already embeds the query options
+                response.raise_for_status()
+
+ results = response.json()
+ items = results.get('value', [])
+ logging.info(f"Found {len(items)} items in folder {folder_id}")
+
+ for item in items:
+ item_name = item.get('name', 'unknown')
+ item_id = item.get('id')
+
+ if 'folder' in item:
+ if recursive:
+ subfolder_path = os.path.join(local_dir, item_name)
+ os.makedirs(subfolder_path, exist_ok=True)
+ subfolder_files = self._download_folder_recursive(
+ item_id,
+ subfolder_path,
+ recursive
+ )
+ files_downloaded += subfolder_files
+ logging.info(f"Downloaded {subfolder_files} files from subfolder {item_name}")
+ else:
+ success = self._download_single_file(item_id, local_dir)
+ if success:
+ files_downloaded += 1
+ logging.info(f"Downloaded file: {item_name}")
+ else:
+ logging.warning(f"Failed to download file: {item_name}")
+
+ url = results.get('@odata.nextLink')
+
+ return files_downloaded
+
+ except Exception as e:
+ logging.error(f"Error in _download_folder_recursive for folder {folder_id}: {e}", exc_info=True)
+ return files_downloaded
+
+ def _download_folder_contents(self, folder_id: str, local_dir: str, recursive: bool = True) -> int:
+ try:
+ self._ensure_valid_token()
+ return self._download_folder_recursive(folder_id, local_dir, recursive)
+ except Exception as e:
+ logging.error(f"Error downloading folder {folder_id}: {e}", exc_info=True)
+ return 0
+
+ def _download_file_to_directory(self, file_id: str, local_dir: str) -> bool:
+ try:
+ self._ensure_valid_token()
+ return self._download_single_file(file_id, local_dir)
+ except Exception as e:
+ logging.error(f"Error downloading file {file_id}: {e}", exc_info=True)
+ return False
+
+    def download_to_directory(self, local_dir: str, source_config: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
+        config = source_config or getattr(self, 'config', {})
+ files_downloaded = 0
+
+ try:
+ folder_ids = config.get('folder_ids', [])
+ file_ids = config.get('file_ids', [])
+ recursive = config.get('recursive', True)
+
+ if file_ids:
+ if isinstance(file_ids, str):
+ file_ids = [file_ids]
+
+ for file_id in file_ids:
+ if self._download_file_to_directory(file_id, local_dir):
+ files_downloaded += 1
+
+ if folder_ids:
+ if isinstance(folder_ids, str):
+ folder_ids = [folder_ids]
+
+ for folder_id in folder_ids:
+ try:
+ url = self._get_item_url(folder_id)
+ params = {'$select': 'id,name'}
+ response = requests.get(url, headers=self._get_headers(), params=params)
+ response.raise_for_status()
+
+ folder_metadata = response.json()
+ folder_name = folder_metadata.get('name', '')
+ folder_path = os.path.join(local_dir, folder_name)
+ os.makedirs(folder_path, exist_ok=True)
+
+ folder_files = self._download_folder_recursive(
+ folder_id,
+ folder_path,
+ recursive
+ )
+ files_downloaded += folder_files
+ logging.info(f"Downloaded {folder_files} files from folder {folder_name}")
+ except Exception as e:
+ logging.error(f"Error downloading folder {folder_id}: {e}", exc_info=True)
+
+ if not file_ids and not folder_ids:
+ raise ValueError("No folder_ids or file_ids provided for download")
+
+ return {
+ "files_downloaded": files_downloaded,
+ "directory_path": local_dir,
+ "empty_result": files_downloaded == 0,
+ "source_type": "share_point",
+ "config_used": config
+ }
+
+ except Exception as e:
+ return {
+ "files_downloaded": files_downloaded,
+ "directory_path": local_dir,
+ "empty_result": True,
+ "source_type": "share_point",
+ "config_used": config,
+ "error": str(e)
+ }
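
A usage sketch for the loader with hypothetical inputs. Note that `next_page_token` holds a Graph $skiptoken for folder listings but a plain integer offset for shared-item listings:

```python
from application.parser.connectors.share_point.loader import SharePointLoader

loader = SharePointLoader("session-token-from-callback")  # placeholder token

# List (metadata only) the first page of the OneDrive root.
page1 = loader.load_data({"list_only": True, "limit": 25})

# Fetch the next page, if any.
if loader.next_page_token:
    page2 = loader.load_data({
        "list_only": True,
        "limit": 25,
        "page_token": loader.next_page_token,
    })

# Shared-with-me items (work/school accounts only).
shared = loader.load_data({"list_only": True, "shared": True, "limit": 25})
```
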
diff --git a/application/requirements.txt b/application/requirements.txt
index e99b8614..9565d40b 100644
--- a/application/requirements.txt
+++ b/application/requirements.txt
@@ -47,6 +47,7 @@ markupsafe==3.0.3
marshmallow>=3.18.0,<5.0.0
mpmath==1.3.0
+msal==1.34.0
 multidict==6.7.0
mypy-extensions==1.1.0
networkx==3.6.1
numpy==2.4.0
@@ -95,4 +96,4 @@ werkzeug>=3.1.0
yarl==1.22.0
markdownify==1.2.2
tldextract==5.3.0
-websockets==15.0.1
+websockets==15.0.1
\ No newline at end of file
diff --git a/frontend/src/agents/AgentCard.tsx b/frontend/src/agents/AgentCard.tsx
index f4c88c9b..6df11e16 100644
--- a/frontend/src/agents/AgentCard.tsx
+++ b/frontend/src/agents/AgentCard.tsx
@@ -320,4 +320,4 @@ export default function AgentCard({
/>
);
-}
+}
\ No newline at end of file
diff --git a/frontend/src/agents/AgentsList.tsx b/frontend/src/agents/AgentsList.tsx
index f550a993..e9aa1bcd 100644
--- a/frontend/src/agents/AgentsList.tsx
+++ b/frontend/src/agents/AgentsList.tsx
@@ -603,4 +603,4 @@ function AgentSection({
);
-}
+}
\ No newline at end of file
diff --git a/frontend/src/agents/NewAgent.tsx b/frontend/src/agents/NewAgent.tsx
index 7ff752bd..d5d28647 100644
--- a/frontend/src/agents/NewAgent.tsx
+++ b/frontend/src/agents/NewAgent.tsx
@@ -1405,4 +1405,4 @@ function AddPromptModal({
handleAddPrompt={handleAddPrompt}
/>
);
-}
+}
\ No newline at end of file
diff --git a/frontend/src/agents/agents.config.ts b/frontend/src/agents/agents.config.ts
index 3569600e..35761e2e 100644
--- a/frontend/src/agents/agents.config.ts
+++ b/frontend/src/agents/agents.config.ts
@@ -41,4 +41,4 @@ export const agentSectionsConfig = [
selectData: selectSharedAgents,
updateAction: setSharedAgents,
},
-];
+];
\ No newline at end of file
diff --git a/frontend/src/agents/index.tsx b/frontend/src/agents/index.tsx
index 988345ad..64481985 100644
--- a/frontend/src/agents/index.tsx
+++ b/frontend/src/agents/index.tsx
@@ -18,4 +18,4 @@ export default function Agents() {