import logging
import os
from datetime import datetime, timezone
from typing import Dict, List, Optional, Union

import yaml
from bson import ObjectId
from bson.dbref import DBRef
from dotenv import load_dotenv
from pymongo import MongoClient

from application.agents.tools.tool_manager import ToolManager
from application.api.user.tasks import ingest_remote

load_dotenv()

# Module-level tool registry; used below to look up action metadata by tool name.
tool_config = {}
tool_manager = ToolManager(config=tool_config)


class DatabaseSeeder:
    """Seed premade ("template") agents and their sources, tools and prompts.

    All documents written by this class are owned by the ``"system"`` user.
    """

    def __init__(self, db):
        """Bind the seeder to a database handle.

        Args:
            db: Mapping-like database object; the collections ``user_tools``,
                ``sources``, ``agents`` and ``prompts`` are looked up on it by
                subscript.
        """
        self.db = db
        self.tools_collection = self.db["user_tools"]
        self.sources_collection = self.db["sources"]
        self.agents_collection = self.db["agents"]
        self.prompts_collection = self.db["prompts"]
        self.system_user_id = "system"
        self.logger = logging.getLogger(__name__)

    def seed_initial_data(self, config_path: Optional[str] = None, force=False):
        """Main entry point for seeding all initial data.

        Args:
            config_path: Path to the YAML seeding config. Defaults to
                ``config/premade_agents.yaml`` next to this module.
            force: When false, seeding is skipped if any system-owned agent
                already exists.

        Raises:
            Exception: Any error raised while reading the config or while
                seeding from it is logged and re-raised unchanged.
        """
        if not force and self._is_already_seeded():
            self.logger.info("Database already seeded. Use force=True to reseed.")
            return

        config_path = config_path or os.path.join(
            os.path.dirname(__file__), "config", "premade_agents.yaml"
        )

        try:
            with open(config_path, "r", encoding="utf-8") as f:
                # safe_load returns None for an empty document; normalise to an
                # empty mapping so the "no agents" warning path is taken instead
                # of failing with AttributeError on None.
                config = yaml.safe_load(f) or {}
            self._seed_from_config(config)
        except Exception as e:
            self.logger.error(f"Failed to load seeding config: {str(e)}")
            raise

    def _seed_from_config(self, config: Dict):
        """Seed all data from configuration.

        Each entry under ``config["agents"]`` is processed independently: a
        failure in one agent is logged and the loop moves on to the next.

        Args:
            config: Parsed seeding configuration; agents are read from the
                ``"agents"`` key.
        """
        self.logger.info("🌱 Starting seeding...")

        if not config.get("agents"):
            self.logger.warning("No agents found in config")
            return

        for agent_config in config["agents"]:
            # Resolve a display name up front so the error handler below can
            # never fail itself when the entry is malformed or has no "name".
            agent_name = (
                agent_config.get("name", "<unnamed>")
                if isinstance(agent_config, dict)
                else "<invalid entry>"
            )
            try:
                # Fail fast (before any source/tool/prompt side effects) when
                # the mandatory name is missing.
                name = agent_config["name"]
                self.logger.info(f"Processing agent: {agent_name}")

                # 1. Handle Source
                source_result = self._handle_source(agent_config)
                if source_result is False:
                    self.logger.error(
                        f"Skipping agent {agent_name} due to source ingestion failure"
                    )
                    continue
                source_id = source_result

                # 2. Handle Tools
                tool_ids = self._handle_tools(agent_config)
                if not tool_ids:
                    self.logger.warning(f"No valid tools for agent {agent_name}")

                # 3. Handle Prompt
                prompt_id = self._handle_prompt(agent_config)

                # 4. Create Agent
                now = datetime.now(timezone.utc)
                agent_data = {
                    "user": self.system_user_id,
                    "name": name,
                    "description": agent_config["description"],
                    "image": agent_config.get("image", ""),
                    "source": (
                        DBRef("sources", ObjectId(source_id)) if source_id else ""
                    ),
                    "tools": [str(tid) for tid in tool_ids],
                    "agent_type": agent_config["agent_type"],
                    "prompt_id": prompt_id or agent_config.get("prompt_id", "default"),
                    "chunks": agent_config.get("chunks", "0"),
                    "retriever": agent_config.get("retriever", ""),
                    "status": "template",
                    "createdAt": now,
                    "updatedAt": now,
                }

                existing = self.agents_collection.find_one(
                    {"user": self.system_user_id, "name": name}
                )
                if existing:
                    self.logger.info(f"Updating existing agent: {agent_name}")
                    # Keep the original creation timestamp when reseeding an
                    # agent that already exists; only updatedAt moves forward.
                    update_fields = {
                        key: value
                        for key, value in agent_data.items()
                        if key != "createdAt"
                    }
                    self.agents_collection.update_one(
                        {"_id": existing["_id"]}, {"$set": update_fields}
                    )
                    agent_id = existing["_id"]
                else:
                    self.logger.info(f"Creating new agent: {agent_name}")
                    result = self.agents_collection.insert_one(agent_data)
                    agent_id = result.inserted_id

                self.logger.info(
                    f"Successfully processed agent: {agent_name} (ID: {agent_id})"
                )
            except Exception as e:
                self.logger.error(f"Error processing agent {agent_name}: {str(e)}")
                continue

        self.logger.info("✅ Database seeding completed")

    def _handle_source(self, agent_config: Dict) -> Union[ObjectId, None, bool]:
        """Handle source ingestion and return source ID.

        Args:
            agent_config: One agent entry; its optional ``"source"`` mapping
                must provide ``"url"`` and ``"name"`` and may provide
                ``"loader"`` (default ``"url"``).

        Returns:
            ``None`` when the agent has no source, the ``_id`` of an existing
            source with the same URL, the ``"id"`` value from the ingestion
            result for a newly ingested source, or ``False`` when lookup or
            ingestion failed.
        """
        if not agent_config.get("source"):
            self.logger.info(
                "No source provided for agent - will create agent without source"
            )
            return None

        source_config = agent_config["source"]
        self.logger.info(f"Ingesting source: {source_config['url']}")

        try:
            existing = self.sources_collection.find_one(
                {"user": self.system_user_id, "remote_data": source_config["url"]}
            )
            if existing:
                self.logger.info(f"Source already exists: {existing['_id']}")
                return existing["_id"]

            # Ingest new source using worker
            task = ingest_remote.delay(
                source_data=source_config["url"],
                job_name=source_config["name"],
                user=self.system_user_id,
                loader=source_config.get("loader", "url"),
            )

            # Blocks for up to five minutes waiting for the worker.
            result = task.get(timeout=300)
            if not task.successful():
                raise Exception(f"Source ingestion failed: {result}")

            if not (isinstance(result, dict) and "id" in result):
                raise Exception(f"Source ingestion result missing 'id': {result}")
            source_id = result["id"]

            self.logger.info(f"Source ingested successfully: {source_id}")
            return source_id
        except Exception as e:
            self.logger.error(f"Failed to ingest source: {str(e)}")
            return False

    def _handle_tools(self, agent_config: Dict) -> List[ObjectId]:
        """Handle tool creation and return list of tool IDs.

        A tool is reused when a system-owned document with the same name and
        processed config already exists; otherwise a new one is inserted.
        Tools that fail to process are logged and skipped.

        Args:
            agent_config: One agent entry; tools are read from its optional
                ``"tools"`` list.

        Returns:
            IDs of the tools that were found or created, in config order.
        """
        tool_ids = []
        if not agent_config.get("tools"):
            return tool_ids

        for tool_entry in agent_config["tools"]:
            # Placeholder so the error log below is always well-defined, even
            # when reading the name itself is what fails.
            tool_name = "<unnamed>"
            try:
                tool_name = tool_entry["name"]
                # Treat a missing or null config as an empty one.
                processed_config = self._process_config(
                    tool_entry.get("config") or {}
                )
                self.logger.info(f"Processing tool: {tool_name}")

                existing = self.tools_collection.find_one(
                    {
                        "user": self.system_user_id,
                        "name": tool_name,
                        "config": processed_config,
                    }
                )
                if existing:
                    self.logger.info(f"Tool already exists: {existing['_id']}")
                    tool_ids.append(existing["_id"])
                    continue

                tool_data = {
                    "user": self.system_user_id,
                    "name": tool_name,
                    "displayName": tool_entry.get("display_name", tool_name),
                    "description": tool_entry.get("description", ""),
                    "actions": tool_manager.tools[tool_name].get_actions_metadata(),
                    "config": processed_config,
                    "status": True,
                }
                result = self.tools_collection.insert_one(tool_data)
                tool_ids.append(result.inserted_id)
                self.logger.info(f"Created new tool: {result.inserted_id}")
            except Exception as e:
                self.logger.error(f"Failed to process tool {tool_name}: {str(e)}")
                continue

        return tool_ids

    def _handle_prompt(self, agent_config: Dict) -> Optional[str]:
        """Handle prompt creation and return prompt ID.

        Args:
            agent_config: One agent entry; the prompt is read from its optional
                ``"prompt"`` mapping (``"name"`` and ``"content"`` keys).

        Returns:
            The prompt ID as a string, or ``None`` when the agent has no
            prompt, the prompt has no content, or the database call failed.
        """
        if not agent_config.get("prompt"):
            return None

        prompt_config = agent_config["prompt"]
        prompt_name = prompt_config.get("name", f"{agent_config['name']} Prompt")
        prompt_content = prompt_config.get("content", "")

        if not prompt_content:
            self.logger.warning(
                f"No prompt content provided for agent {agent_config['name']}"
            )
            return None

        self.logger.info(f"Processing prompt: {prompt_name}")

        try:
            existing = self.prompts_collection.find_one(
                {
                    "user": self.system_user_id,
                    "name": prompt_name,
                    "content": prompt_content,
                }
            )
            if existing:
                self.logger.info(f"Prompt already exists: {existing['_id']}")
                return str(existing["_id"])

            prompt_data = {
                "name": prompt_name,
                "content": prompt_content,
                "user": self.system_user_id,
            }
            result = self.prompts_collection.insert_one(prompt_data)
            prompt_id = str(result.inserted_id)
            self.logger.info(f"Created new prompt: {prompt_id}")
            return prompt_id
        except Exception as e:
            self.logger.error(f"Failed to process prompt {prompt_name}: {str(e)}")
            return None

    def _process_config(self, config: Dict) -> Dict:
        """Process config values to replace environment variables.

        A string value of the exact form ``${NAME}`` is replaced by the value
        of environment variable ``NAME`` (empty string when unset). Only
        top-level values are examined; everything else is copied as-is.

        Args:
            config: Tool configuration mapping.

        Returns:
            A new dict with placeholders resolved.
        """
        processed = {}
        for key, value in config.items():
            is_placeholder = (
                isinstance(value, str)
                and value.startswith("${")
                and value.endswith("}")
            )
            # Strip the leading "${" and trailing "}" to get the variable name.
            processed[key] = os.getenv(value[2:-1], "") if is_placeholder else value
        return processed

    def _is_already_seeded(self) -> bool:
        """Check if premade agents already exist."""
        # Only existence matters, so stop counting after the first match.
        return (
            self.agents_collection.count_documents(
                {"user": self.system_user_id}, limit=1
            )
            > 0
        )

    @classmethod
    def initialize_from_env(cls, worker=None):
        """Factory method to create seeder from environment.

        Reads ``MONGO_URI`` (default ``mongodb://localhost:27017``) and
        ``MONGO_DB_NAME`` (default ``docsgpt``) from the environment.

        Args:
            worker: Accepted for interface compatibility; not used here.

        Returns:
            A seeder bound to the selected database.
        """
        mongo_uri = os.getenv("MONGO_URI", "mongodb://localhost:27017")
        db_name = os.getenv("MONGO_DB_NAME", "docsgpt")
        client = MongoClient(mongo_uri)
        db = client[db_name]
        return cls(db)