""" Shadow Graph implementation for GhostCrew. This module provides a lightweight knowledge graph that is built automatically from agent notes. It is used by the Orchestrator to compute strategic insights (e.g., "we have creds for X but haven't scanned it") without burdening the agents with graph management. Architecture: Notes (Source of Truth) -> Shadow Graph (Derived View) -> Insights (Strategic Hints) """ import logging import re from dataclasses import dataclass, field from typing import Any, Dict, List, Set import networkx as nx logger = logging.getLogger(__name__) @dataclass class GraphNode: """A node in the shadow graph.""" id: str type: str # host, service, credential, finding, artifact label: str metadata: Dict[str, Any] = field(default_factory=dict) def __hash__(self): return hash(self.id) @dataclass class GraphEdge: """An edge in the shadow graph.""" source: str target: str type: str # CONNECTS_TO, HAS_SERVICE, AUTH_ACCESS, RELATED_TO metadata: Dict[str, Any] = field(default_factory=dict) class ShadowGraph: """ A NetworkX-backed knowledge graph that derives its state from notes. """ def __init__(self): self.graph = nx.DiGraph() self._processed_notes: Set[str] = set() # Regex patterns for entity extraction self._ip_pattern = re.compile(r"\b(?:\d{1,3}\.){3}\d{1,3}\b") self._port_pattern = re.compile(r"(\d{1,5})/(tcp|udp)") self._user_pattern = re.compile( r"(?:user|username)[:\s]+([a-zA-Z0-9_.-]+)", re.IGNORECASE ) self._source_pattern = re.compile( r"(?:found on|dumped from|extracted from|on host)\s+((?:\d{1,3}\.){3}\d{1,3})", re.IGNORECASE, ) def update_from_notes(self, notes: Dict[str, Dict[str, Any]]) -> None: """ Update the graph based on new notes. This method is idempotent and incremental. It only processes notes that haven't been seen before (based on key). """ for key, note_data in notes.items(): if key in self._processed_notes: continue # Handle legacy format if isinstance(note_data, str): content = note_data category = "info" metadata = {} status = "confirmed" else: content = note_data.get("content", "") category = note_data.get("category", "info") metadata = note_data.get("metadata", {}) status = note_data.get("status", "confirmed") self._process_note(key, content, category, metadata, status) self._processed_notes.add(key) def _process_note( self, key: str, content: str, category: str, metadata: Dict[str, Any], status: str, ) -> None: """Extract entities and relationships from a single note.""" # 1. Extract IPs (Hosts) # Prefer metadata if available hosts = [] # Check target in metadata if metadata.get("target"): target_ip = metadata["target"] # Validate it looks like an IP or hostname? For now just accept it. node_id = f"host:{target_ip}" self._add_node(node_id, "host", target_ip) hosts.append(node_id) # Check source in metadata if metadata.get("source"): source_ip = metadata["source"] node_id = f"host:{source_ip}" self._add_node(node_id, "host", source_ip) hosts.append(node_id) # Fallback to regex if no hosts found in metadata if not hosts: ips = self._ip_pattern.findall(content) for ip in ips: node_id = f"host:{ip}" self._add_node(node_id, "host", ip) hosts.append(node_id) # 2. Handle specific categories if category == "credential": self._process_credential(key, content, hosts, metadata, status) elif category == "finding": self._process_finding(key, content, hosts, metadata, status) elif category == "vulnerability": self._process_vulnerability(key, content, hosts, metadata, status) # 3. Link note to hosts (provenance) # We don't add the note itself as a node usually, but we could. # For now, we just use the note to build Host-to-Host or Host-to-Service links. def _add_node(self, node_id: str, node_type: str, label: str, **kwargs) -> None: """Add a node if it doesn't exist.""" if not self.graph.has_node(node_id): self.graph.add_node(node_id, type=node_type, label=label, **kwargs) def _add_edge(self, source: str, target: str, edge_type: str, **kwargs) -> None: """Add an edge.""" if self.graph.has_node(source) and self.graph.has_node(target): self.graph.add_edge(source, target, type=edge_type, **kwargs) def _process_credential( self, key: str, content: str, related_hosts: List[str], metadata: Dict[str, Any], status: str, ) -> None: """Process a credential note.""" # Skip if status is closed/filtered (invalid creds) if status in ["closed", "filtered"]: return # Extract username from metadata or regex username = metadata.get("username") if not username: user_match = self._user_pattern.search(content) username = user_match.group(1) if user_match else None cred_id = f"cred:{key}" label = f"Creds ({username})" if username else "Credentials" self._add_node(cred_id, "credential", label) # Check for "found on" source host source_host = None if metadata.get("source"): source_ip = metadata["source"] source_host = f"host:{source_ip}" else: source_match = self._source_pattern.search(content) if source_match: source_ip = source_match.group(1) source_host = f"host:{source_ip}" if source_host: # Add CONTAINS edge: Host -> Cred if self.graph.has_node(source_host): self._add_edge(source_host, cred_id, "CONTAINS") # Link cred to hosts it belongs to (or works on) for host_id in related_hosts: # If this host is the source, skip adding it as a target unless explicitly clear? # For now, if we identified it as source, assume it's NOT the target unless it's the only one? # Let's just exclude the source host from being an AUTH_ACCESS target to avoid loops, # unless we want to represent local privesc (which is valid). # But for pivoting, we care about A -> Cred -> B. # If we found a source, and this host is that source, treat it as CONTAINS (already done). # Otherwise, treat as AUTH_ACCESS. if source_host and host_id == source_host: continue # If the note says "ssh", assume SSH access protocol = "ssh" if "ssh" in content.lower() else "unknown" self._add_edge(cred_id, host_id, "AUTH_ACCESS", protocol=protocol) def _process_finding( self, key: str, content: str, related_hosts: List[str], metadata: Dict[str, Any], status: str, ) -> None: """Process a finding note (e.g., open ports).""" # Skip if status is closed/filtered if status in ["closed", "filtered"]: return # Filter related_hosts: If we have explicit target metadata, ONLY use that. # Otherwise, use all related hosts (fallback to regex behavior). target_hosts = related_hosts if metadata.get("target"): target_ip = metadata["target"] target_id = f"host:{target_ip}" # Only use the target if it's in the related_hosts list (sanity check) if target_id in related_hosts: target_hosts = [target_id] # Extract ports from metadata or regex ports = [] if metadata.get("port"): # Handle single port in metadata p = str(metadata["port"]) # Assume tcp if not specified? proto = "tcp" if "/" in p: p, proto = p.split("/") ports.append((p, proto)) # Always check regex too, in case metadata missed some regex_ports = self._port_pattern.findall(content) for p, proto in regex_ports: if (p, proto) not in ports: ports.append((p, proto)) for port, proto in ports: for host_id in target_hosts: service_id = f"service:{host_id}:{port}" # Add URL to label if present label = f"{port}/{proto}" if metadata.get("url"): label += f" ({metadata['url']})" self._add_node(service_id, "service", label) self._add_edge(host_id, service_id, "HAS_SERVICE", protocol=proto) def _process_vulnerability( self, key: str, content: str, related_hosts: List[str], metadata: Dict[str, Any], status: str, ) -> None: """Process a vulnerability note.""" # Skip if status is closed/filtered (patched or not vulnerable) if status in ["closed", "filtered"]: return # Filter related_hosts: If we have explicit target metadata, ONLY use that. target_hosts = related_hosts if metadata.get("target"): target_ip = metadata["target"] target_id = f"host:{target_ip}" if target_id in related_hosts: target_hosts = [target_id] vuln_id = f"vuln:{key}" # Try to extract CVE from metadata or regex label = "Vulnerability" if metadata.get("cve"): label = metadata["cve"] else: cve_match = re.search(r"CVE-\d{4}-\d{4,7}", content, re.IGNORECASE) if cve_match: label = cve_match.group(0) self._add_node(vuln_id, "vulnerability", label) for host_id in target_hosts: self._add_edge(host_id, vuln_id, "AFFECTED_BY") def get_strategic_insights(self) -> List[str]: """ Analyze the graph and return natural language insights for the Orchestrator. """ insights = [] # Insight 1: Unused Credentials # Find credentials that have AUTH_ACCESS to a host, but we haven't "explored" that host fully? # Or simply list valid access paths. for node, data in self.graph.nodes(data=True): if data.get("type") == "credential": # Find what it connects to targets = [v for u, v in self.graph.out_edges(node)] if targets: target_labels = [ self.graph.nodes[t].get("label", t) for t in targets ] insights.append( f"We have credentials that provide access to: {', '.join(target_labels)}" ) # Insight 2: High Value Targets (Hosts with many open ports/vulns) for node, data in self.graph.nodes(data=True): if data.get("type") == "host": # Count services services = [ v for u, v in self.graph.out_edges(node) if self.graph.nodes[v].get("type") == "service" ] vulns = [ v for u, v in self.graph.out_edges(node) if self.graph.nodes[v].get("type") == "vulnerability" ] if len(services) > 0 or len(vulns) > 0: insights.append( f"Host {data['label']} has {len(services)} services and {len(vulns)} known vulnerabilities." ) # Insight 3: Potential Pivots (Host A -> Cred -> Host B) # Use NetworkX to find paths from Credentials to Hosts that aren't directly connected attack_paths = self._find_attack_paths() if attack_paths: insights.extend(attack_paths) return insights def _find_attack_paths(self) -> List[str]: """ Find multi-step attack paths using shortest path algorithms. Example: Credential A -> Host A -> Credential B -> Host B """ paths = [] creds = [n for n, d in self.graph.nodes(data=True) if d["type"] == "credential"] hosts = [n for n, d in self.graph.nodes(data=True) if d["type"] == "host"] for cred in creds: for host in hosts: # Skip if directly connected (we already know we have access) if self.graph.has_edge(cred, host): continue try: # Find shortest path path = nx.shortest_path(self.graph, cred, host) # Only interesting if it involves intermediate steps if len(path) > 2: # Convert IDs to Labels for readability readable_path = [] for node_id in path: node_data = self.graph.nodes[node_id] readable_path.append(node_data.get("label", node_id)) paths.append(f"Attack Path Found: {' -> '.join(readable_path)}") except nx.NetworkXNoPath: continue return paths def to_mermaid(self) -> str: """Export graph to Mermaid flowchart format.""" lines = ["graph TD"] # Add nodes for node, data in self.graph.nodes(data=True): # Sanitize ID for mermaid safe_id = re.sub(r"[^a-zA-Z0-9]", "_", node) label = data.get("label", node).replace('"', "'") # Style based on type if data["type"] == "host": lines.append(f' {safe_id}["🖥️ {label}"]') elif data["type"] == "credential": lines.append(f' {safe_id}["🔑 {label}"]') elif data["type"] == "vulnerability": lines.append(f' {safe_id}["⚠️ {label}"]') elif data["type"] == "service": lines.append(f' {safe_id}["🔌 {label}"]') else: lines.append(f' {safe_id}["{label}"]') # Add edges for u, v, data in self.graph.edges(data=True): safe_u = re.sub(r"[^a-zA-Z0-9]", "_", u) safe_v = re.sub(r"[^a-zA-Z0-9]", "_", v) edge_label = data.get("type", "") lines.append(f" {safe_u} -->|{edge_label}| {safe_v}") return "\n".join(lines) def export_summary(self) -> str: """Export a text summary of the graph state.""" stats = { "hosts": len( [n for n, d in self.graph.nodes(data=True) if d["type"] == "host"] ), "creds": len( [n for n, d in self.graph.nodes(data=True) if d["type"] == "credential"] ), "vulns": len( [ n for n, d in self.graph.nodes(data=True) if d["type"] == "vulnerability" ] ), } return f"Graph State: {stats['hosts']} Hosts, {stats['creds']} Credentials, {stats['vulns']} Vulnerabilities"