(feat:connectors) separate layer

2026-01-20 14:00:55 +00:00 · 2025-08-26 01:38:36 +05:30
parent 15a9e97a1e
commit f09f1433a9
8 changed files with 125 additions and 33 deletions
--- a/application/parser/connectors/__init__.py
+++ b/application/parser/connectors/__init__.py
@@ -0,0 +1,11 @@
+"""
+External knowledge base connectors for DocsGPT.
+
+This module contains connectors for external knowledge bases and document storage systems
+that require authentication and specialized handling, separate from simple web scrapers.
+"""
+
+from .connector_creator import ConnectorCreator
+from .google_drive import GoogleDriveAuth, GoogleDriveLoader
+
+__all__ = ['ConnectorCreator', 'GoogleDriveAuth', 'GoogleDriveLoader']
--- a/application/parser/connectors/connector_creator.py
+++ b/application/parser/connectors/connector_creator.py
@@ -0,0 +1,57 @@
+from application.parser.connectors.google_drive.loader import GoogleDriveLoader
+
+
+class ConnectorCreator:
+    """
+    Factory class for creating external knowledge base connectors.
+
+    These are different from remote loaders as they typically require
+    authentication and connect to external document storage systems.
+    """
+
+    connectors = {
+        "google_drive": GoogleDriveLoader,
+    }
+
+    @classmethod
+    def create_connector(cls, connector_type, *args, **kwargs):
+        """
+        Create a connector instance for the specified type.
+
+        Args:
+            connector_type: Case-insensitive type name (e.g., 'google_drive')
+            *args, **kwargs: Arguments to pass to the connector constructor
+
+        Returns:
+            Connector instance
+
+        Raises:
+            ValueError: If connector type is not a string or is not supported
+        """
+        key = connector_type.lower() if isinstance(connector_type, str) else None
+        if not cls.connectors.get(key):
+            raise ValueError(f"No connector class found for type {connector_type}")
+        return cls.connectors[key](*args, **kwargs)
+
+    @classmethod
+    def get_supported_connectors(cls):
+        """
+        Get list of supported connector types.
+
+        Returns:
+            List of supported connector type strings
+        """
+        return list(cls.connectors)
+
+    @classmethod
+    def is_supported(cls, connector_type):
+        """
+        Check if a connector type is supported (case-insensitive).
+
+        Args:
+            connector_type: Type of connector to check
+
+        Returns:
+            True if supported, False otherwise (including non-string input)
+        """
+        return isinstance(connector_type, str) and connector_type.lower() in cls.connectors
--- a/application/parser/connectors/google_drive/__init__.py
+++ b/application/parser/connectors/google_drive/__init__.py
@@ -0,0 +1,10 @@
+"""
+Google Drive connector for DocsGPT.
+
+This module provides authentication and document loading capabilities for Google Drive.
+"""
+
+from .auth import GoogleDriveAuth
+from .loader import GoogleDriveLoader
+
+__all__ = ['GoogleDriveAuth', 'GoogleDriveLoader']
--- a/application/parser/connectors/google_drive/auth.py
+++ b/application/parser/connectors/google_drive/auth.py
--- a/application/parser/connectors/google_drive/loader.py
+++ b/application/parser/connectors/google_drive/loader.py
@@ -12,7 +12,7 @@ from googleapiclient.http import MediaIoBaseDownload
 from googleapiclient.errors import HttpError

 from application.parser.remote.base import BaseRemote
-from application.parser.remote.google_auth import GoogleDriveAuth
+from application.parser.connectors.google_drive.auth import GoogleDriveAuth
 from application.parser.schema.base import Document


@@ -329,7 +329,7 @@ class GoogleDriveLoader(BaseRemote):

            if e.resp.status in [401, 403]:
                logging.error(f"Authentication error downloading file {file_id}")
-              
+
                if hasattr(self.credentials, 'refresh_token') and self.credentials.refresh_token:
                    logging.info(f"Attempting to refresh credentials for file {file_id}")
                    try:
@@ -406,10 +406,10 @@ class GoogleDriveLoader(BaseRemote):
        files_downloaded = 0
        try:
            os.makedirs(local_dir, exist_ok=True)
-            
+
            query = f"'{folder_id}' in parents and trashed=false"
            page_token = None
-            
+
            while True:
                results = self.service.files().list(
                    q=query,
@@ -417,15 +417,15 @@ class GoogleDriveLoader(BaseRemote):
                    pageToken=page_token,
                    pageSize=1000
                ).execute()
-                
+
                items = results.get('files', [])
                logging.info(f"Found {len(items)} items in folder {folder_id}")
-                
+
                for item in items:
                    item_name = item['name']
                    item_id = item['id']
                    mime_type = item['mimeType']
-                    
+
                    if mime_type == 'application/vnd.google-apps.folder':
                        if recursive:
                            # Create subfolder and recurse
@@ -446,13 +446,13 @@ class GoogleDriveLoader(BaseRemote):
                            logging.info(f"Downloaded file: {item_name}")
                        else:
                            logging.warning(f"Failed to download file: {item_name}")
-                
+
                page_token = results.get('nextPageToken')
                if not page_token:
                    break
-                    
+
            return files_downloaded
-            
+
        except Exception as e:
            logging.error(f"Error in _download_folder_recursive for folder {folder_id}: {e}", exc_info=True)
            return files_downloaded
@@ -513,7 +513,7 @@ class GoogleDriveLoader(BaseRemote):
                        folder_name = folder_metadata.get('name', '')
                        folder_path = os.path.join(local_dir, folder_name)
                        os.makedirs(folder_path, exist_ok=True)
-                        
+
                        folder_files = self._download_folder_recursive(
                            folder_id,
                            folder_path,
--- a/application/parser/remote/remote_creator.py
+++ b/application/parser/remote/remote_creator.py
@@ -3,17 +3,25 @@ from application.parser.remote.crawler_loader import CrawlerLoader
 from application.parser.remote.web_loader import WebLoader
 from application.parser.remote.reddit_loader import RedditPostsLoaderRemote
 from application.parser.remote.github_loader import GitHubLoader
-from application.parser.remote.google_drive_loader import GoogleDriveLoader


 class RemoteCreator:
+    """
+    Factory class for creating remote content loaders.
+
+    These loaders fetch content from remote web sources like URLs,
+    sitemaps, web crawlers, social media platforms, etc.
+
+    For external knowledge base connectors (like Google Drive),
+    use ConnectorCreator instead.
+    """
+
    loaders = {
        "url": WebLoader,
        "sitemap": SitemapLoader,
        "crawler": CrawlerLoader,
        "reddit": RedditPostsLoaderRemote,
        "github": GitHubLoader,
-        "google_drive": GoogleDriveLoader,
    }

    @classmethod