# pentestagent/ghostcrew/knowledge/graph.py

"""
Shadow Graph implementation for GhostCrew.

This module provides a lightweight knowledge graph that is built automatically
from agent notes. It is used by the Orchestrator to compute strategic insights
(e.g., "we have creds for X but haven't scanned it") without burdening the
agents with graph management.

Architecture:
    Notes (Source of Truth) -> Shadow Graph (Derived View) -> Insights (Strategic Hints)
"""

import logging
import re
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Set, Tuple

import networkx as nx

logger = logging.getLogger(__name__)


@dataclass
class GraphNode:
    """Record describing one vertex of the shadow graph.

    Equality is the dataclass default (all fields are compared), while
    hashing deliberately looks at ``id`` only, so instances stay usable as
    set members / dict keys even though ``metadata`` is a mutable dict.
    """

    # Unique identifier of the node; the only field that feeds the hash.
    id: str
    # Node category: host, service, credential, finding, artifact
    # (the graph builder in this module also uses "vulnerability").
    type: str
    # Human-readable name of the node.
    label: str
    # Free-form extra attributes; every instance gets its own dict.
    metadata: Dict[str, Any] = field(default_factory=dict)

    def __hash__(self):
        # Hash on the identifier alone so the value never depends on the
        # unhashable ``metadata`` dict.
        return hash(self.id)


@dataclass
class GraphEdge:
    """Record describing one directed relationship of the shadow graph.

    The edge points from ``source`` to ``target``; both are node
    identifiers.
    """

    # Identifier of the node the edge starts at.
    source: str
    # Identifier of the node the edge points to.
    target: str
    # Relationship kind: CONNECTS_TO, HAS_SERVICE, AUTH_ACCESS, RELATED_TO
    # (the graph builder in this module also emits AFFECTED_BY).
    type: str
    # Free-form extra attributes; every instance gets its own dict.
    metadata: Dict[str, Any] = field(default_factory=dict)


class ShadowGraph:
    """
    A NetworkX-backed knowledge graph that derives its state from notes.

    Every node carries a ``type`` and a ``label`` attribute, every edge a
    ``type`` attribute. Node identifiers are built as follows:

    * ``host:<ip>``                           - one per IPv4 address seen
    * ``cred:<note key>``                     - one per credential note
    * ``service:<host id>:<port>/<proto>``    - one per host/port/protocol
    * ``vuln:<note key>``                     - one per vulnerability note

    Edge types emitted: ``AUTH_ACCESS`` (credential -> host),
    ``HAS_SERVICE`` (host -> service), ``AFFECTED_BY`` (host -> vulnerability).
    """

    # Regex patterns for entity extraction. They are class attributes so they
    # are compiled once and remain reachable as ``self._..._pattern``.
    _ip_pattern = re.compile(r'\b(?:\d{1,3}\.){3}\d{1,3}\b')
    # The look-behind stops "123456/tcp" from being read as port 23456.
    _port_pattern = re.compile(r'(?<!\d)(\d{1,5})/(tcp|udp)', re.IGNORECASE)
    # Accepts both "user: bob" and "username: bob".
    _user_pattern = re.compile(r'user(?:name)?[:\s]+([a-zA-Z0-9_.-]+)', re.IGNORECASE)
    _cve_pattern = re.compile(r'CVE-\d{4}-\d{4,7}', re.IGNORECASE)

    def __init__(self):
        self.graph = nx.DiGraph()
        # Keys of the notes that have already been folded into the graph.
        self._processed_notes: Set[str] = set()

    def update_from_notes(self, notes: Dict[str, Dict[str, Any]]) -> None:
        """
        Update the graph based on new notes.

        This method is idempotent and incremental. It only processes notes
        that haven't been seen before (based on key), so a note whose content
        changes under an already-seen key is NOT re-processed.

        Args:
            notes: Mapping of note key to note data. Note data is either a
                dict with optional ``content`` and ``category`` entries, or
                (legacy format) a plain string treated as ``info`` content.
        """
        for key, note_data in notes.items():
            if key in self._processed_notes:
                continue

            # Handle legacy format
            if isinstance(note_data, str):
                content = note_data
                category = "info"
            else:
                content = note_data.get("content", "")
                category = note_data.get("category", "info")

            # A missing/non-text body must not crash the regex extraction:
            # the key would never be marked as processed and every later
            # update would fail on the same note again.
            if content is None:
                content = ""
            elif not isinstance(content, str):
                content = str(content)

            self._process_note(key, content, category)
            self._processed_notes.add(key)

    def _extract_ips(self, content: str) -> List[str]:
        """Return the distinct, valid IPv4 addresses in *content*, in order of appearance."""
        found: List[str] = []
        seen = set()
        for ip in self._ip_pattern.findall(content):
            if ip in seen:
                continue
            # The pattern only checks the dotted shape; reject octets > 255
            # so strings such as "999.1.1.1" do not become host nodes.
            if any(int(octet) > 255 for octet in ip.split(".")):
                continue
            seen.add(ip)
            found.append(ip)
        return found

    def _extract_ports(self, content: str) -> List[Tuple[str, str]]:
        """Return the distinct ``(port, protocol)`` pairs in *content*, in order of appearance.

        Ports outside 1-65535 are dropped, the port is normalized to its
        decimal form and the protocol to lower case.
        """
        found: List[Tuple[str, str]] = []
        seen = set()
        for raw_port, raw_proto in self._port_pattern.findall(content):
            port_number = int(raw_port)
            if not 1 <= port_number <= 65535:
                continue
            pair = (str(port_number), raw_proto.lower())
            if pair in seen:
                continue
            seen.add(pair)
            found.append(pair)
        return found

    def _process_note(self, key: str, content: str, category: str) -> None:
        """Extract entities and relationships from a single note.

        Every valid IPv4 address in the note becomes a host node; the
        category then decides which additional nodes/edges are derived.
        Categories other than credential/finding/vulnerability only
        contribute host nodes.
        """

        # 1. Extract IPs (Hosts)
        hosts = []
        for ip in self._extract_ips(content):
            node_id = f"host:{ip}"
            self._add_node(node_id, "host", ip)
            hosts.append(node_id)

        # 2. Handle specific categories
        if category == "credential":
            self._process_credential(key, content, hosts)
        elif category == "finding":
            self._process_finding(key, content, hosts)
        elif category == "vulnerability":
            self._process_vulnerability(key, content, hosts)

        # 3. Link note to hosts (provenance)
        # We don't add the note itself as a node usually, but we could.
        # For now, we just use the note to build Host-to-Host or Host-to-Service links.

    def _add_node(self, node_id: str, node_type: str, label: str, **kwargs) -> None:
        """Add a node if it doesn't exist; an existing node keeps its attributes."""
        if not self.graph.has_node(node_id):
            self.graph.add_node(node_id, type=node_type, label=label, **kwargs)

    def _add_edge(self, source: str, target: str, edge_type: str, **kwargs) -> None:
        """Add an edge between two existing nodes.

        The edge is silently skipped when either endpoint is missing, so this
        never creates attribute-less nodes as a side effect.
        """
        if self.graph.has_node(source) and self.graph.has_node(target):
            self.graph.add_edge(source, target, type=edge_type, **kwargs)

    def _process_credential(self, key: str, content: str, related_hosts: List[str]) -> None:
        """Process a credential note.

        Creates one credential node per note (labelled with the extracted
        username, or "unknown") and links it to every host in the note.
        """
        # Extract username
        user_match = self._user_pattern.search(content)
        username = user_match.group(1) if user_match else "unknown"

        cred_id = f"cred:{key}"
        self._add_node(cred_id, "credential", f"Creds ({username})")

        # If the note says "ssh", assume SSH access. The answer is the same
        # for every host of the note, so compute it once.
        protocol = "ssh" if "ssh" in content.lower() else "unknown"

        # Link cred to hosts it belongs to (or works on)
        for host_id in related_hosts:
            self._add_edge(cred_id, host_id, "AUTH_ACCESS", protocol=protocol)

    def _process_finding(self, key: str, content: str, related_hosts: List[str]) -> None:
        """Process a finding note (e.g., open ports).

        Every port found in the note is attached as a service to every host
        found in the same note.
        """
        for port, proto in self._extract_ports(content):
            for host_id in related_hosts:
                # The protocol is part of the id: 53/tcp and 53/udp are two
                # different services and must not collapse into one node.
                service_id = f"service:{host_id}:{port}/{proto}"
                self._add_node(service_id, "service", f"{port}/{proto}")
                self._add_edge(host_id, service_id, "HAS_SERVICE", protocol=proto)

    def _process_vulnerability(self, key: str, content: str, related_hosts: List[str]) -> None:
        """Process a vulnerability note.

        Creates one vulnerability node per note, labelled with the first CVE
        identifier found (upper-cased) or "Vulnerability", and links every
        host in the note to it.
        """
        vuln_id = f"vuln:{key}"
        # Try to extract CVE
        cve_match = self._cve_pattern.search(content)
        label = cve_match.group(0).upper() if cve_match else "Vulnerability"

        self._add_node(vuln_id, "vulnerability", label)

        for host_id in related_hosts:
            self._add_edge(host_id, vuln_id, "AFFECTED_BY")

    def get_strategic_insights(self) -> List[str]:
        """
        Analyze the graph and return natural language insights for the Orchestrator.

        Returns:
            Credential insights first (one per credential that has at least
            one outgoing edge), followed by host insights (one per host with
            at least one service or vulnerability).
        """
        credential_insights: List[str] = []
        host_insights: List[str] = []
        nodes = self.graph.nodes

        for node, data in nodes(data=True):
            node_type = data.get("type")

            # Insight 1: Unused Credentials
            # Find credentials that have AUTH_ACCESS to a host, but we haven't "explored" that host fully?
            # Or simply list valid access paths.
            if node_type == "credential":
                targets = list(self.graph.successors(node))
                if targets:
                    target_labels = [nodes[t].get("label", t) for t in targets]
                    credential_insights.append(
                        f"We have credentials that provide access to: {', '.join(target_labels)}"
                    )

            # Insight 2: High Value Targets (Hosts with many open ports/vulns)
            elif node_type == "host":
                neighbour_types = [nodes[v].get("type") for v in self.graph.successors(node)]
                service_count = neighbour_types.count("service")
                vuln_count = neighbour_types.count("vulnerability")

                if service_count or vuln_count:
                    host_insights.append(
                        f"Host {data.get('label', node)} has {service_count} services "
                        f"and {vuln_count} known vulnerabilities."
                    )

        # Insight 3: Potential Pivots (Host A -> Cred -> Host B)
        # This is harder without explicit "source" of creds, but we can infer.

        return credential_insights + host_insights

    def export_summary(self) -> str:
        """Export a text summary of the graph state.

        Nodes without a ``type`` attribute (e.g. added to ``self.graph`` from
        outside this class) are ignored instead of raising ``KeyError``.
        """
        counts = {"host": 0, "credential": 0, "vulnerability": 0}
        for _, data in self.graph.nodes(data=True):
            node_type = data.get("type")
            if node_type in counts:
                counts[node_type] += 1
        return (
            f"Graph State: {counts['host']} Hosts, {counts['credential']} Credentials, "
            f"{counts['vulnerability']} Vulnerabilities"
        )