mirror of
https://github.com/GH05TCREW/pentestagent.git
synced 2026-03-07 22:33:38 +00:00
feat(crew): introduce shadow knowledge graph for orchestrator decision-making
This commit is contained in:
189
ghostcrew/knowledge/graph.py
Normal file
189
ghostcrew/knowledge/graph.py
Normal file
@@ -0,0 +1,189 @@
|
||||
"""
|
||||
Shadow Graph implementation for GhostCrew.
|
||||
|
||||
This module provides a lightweight knowledge graph that is built automatically
|
||||
from agent notes. It is used by the Orchestrator to compute strategic insights
|
||||
(e.g., "we have creds for X but haven't scanned it") without burdening the
|
||||
agents with graph management.
|
||||
|
||||
Architecture:
|
||||
Notes (Source of Truth) -> Shadow Graph (Derived View) -> Insights (Strategic Hints)
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any, Dict, List, Optional, Set, Tuple
|
||||
|
||||
import networkx as nx
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class GraphNode:
    """Record describing a single vertex of the shadow graph.

    Instances hash on ``id`` alone, so they can be placed in sets or used as
    dict keys. Equality is the dataclass-generated field-by-field comparison.
    """

    # Identifier of the node; the only field that feeds ``__hash__``.
    id: str
    # Node category, e.g. host, service, credential, finding, artifact.
    type: str
    # Human-readable name of the node.
    label: str
    # Free-form extra attributes; every instance gets its own fresh dict.
    metadata: Dict[str, Any] = field(default_factory=dict)

    def __hash__(self) -> int:
        # Defined explicitly so the dataclass machinery keeps the class
        # hashable even though it also generates ``__eq__``.
        return hash(self.id)
|
||||
|
||||
|
||||
@dataclass
class GraphEdge:
    """Record describing a directed, typed link between two graph nodes."""

    # Identifier of the node the edge starts from.
    source: str
    # Identifier of the node the edge points to.
    target: str
    # Relationship kind, e.g. CONNECTS_TO, HAS_SERVICE, AUTH_ACCESS, RELATED_TO.
    type: str
    # Free-form extra attributes; every instance gets its own fresh dict.
    metadata: Dict[str, Any] = field(default_factory=dict)
|
||||
|
||||
|
||||
class ShadowGraph:
    """
    A NetworkX-backed knowledge graph that derives its state from notes.

    Nodes live directly on ``self.graph`` (a ``networkx.DiGraph``) and carry
    ``type`` and ``label`` attributes; edges carry a ``type`` attribute.

    Node ids produced by this class:
        host:<ip>                          one per valid IPv4 address seen
        service:host:<ip>:<port>/<proto>   one per port/protocol pair on a host
        cred:<note key>                    one per "credential" note
        vuln:<note key>                    one per "vulnerability" note

    Edge types produced by this class:
        HAS_SERVICE   host -> service
        AUTH_ACCESS   credential -> host
        AFFECTED_BY   host -> vulnerability
    """

    def __init__(self):
        self.graph = nx.DiGraph()
        # Keys of notes already folded into the graph (see update_from_notes).
        self._processed_notes: Set[str] = set()

        # Regex patterns for entity extraction
        self._ip_pattern = re.compile(r'\b(?:\d{1,3}\.){3}\d{1,3}\b')
        # The lookbehind stops "123456/tcp" from being read as port 23456.
        self._port_pattern = re.compile(r'(?<!\d)(\d{1,5})/(tcp|udp)', re.IGNORECASE)
        # Accepts both "user: bob" and "username: bob".
        self._user_pattern = re.compile(r'user(?:name)?[:\s]+([a-zA-Z0-9_.-]+)', re.IGNORECASE)
        self._cve_pattern = re.compile(r'CVE-\d{4}-\d{4,7}', re.IGNORECASE)

    @staticmethod
    def _is_valid_ipv4(candidate: str) -> bool:
        """Return True if *candidate* is four dot-separated octets in 0-255."""
        octets = candidate.split(".")
        return len(octets) == 4 and all(
            octet.isascii() and octet.isdigit() and int(octet) <= 255
            for octet in octets
        )

    @staticmethod
    def _normalize_port(port: str) -> Optional[str]:
        """Return *port* without leading zeros, or None if outside 1-65535."""
        if not (port.isascii() and port.isdigit()):
            return None
        number = int(port)
        return str(number) if 1 <= number <= 65535 else None

    def update_from_notes(self, notes: Dict[str, Dict[str, Any]]) -> None:
        """
        Update the graph based on new notes.

        This method is idempotent and incremental. It only processes notes
        that haven't been seen before (based on key), so a note whose content
        changes under an already-processed key is NOT re-read.

        Args:
            notes: Mapping of note key to note payload. A payload is either a
                dict with optional "content" and "category" entries, or a
                plain string (legacy format, treated as category "info").
                Payloads of any other shape are skipped and stay unprocessed.
        """
        if not notes:
            return

        for key, note_data in notes.items():
            if key in self._processed_notes:
                continue

            # Handle legacy format
            if isinstance(note_data, str):
                content = note_data
                category = "info"
            elif hasattr(note_data, "get"):
                # ``or`` also covers an explicit None stored under the key.
                content = note_data.get("content") or ""
                category = note_data.get("category") or "info"
            else:
                logger.debug(
                    "Skipping note %r: unsupported payload type %s",
                    key,
                    type(note_data).__name__,
                )
                continue

            # The regex helpers require text; coerce anything else.
            if not isinstance(content, str):
                content = str(content)

            self._process_note(key, content, category)
            self._processed_notes.add(key)

    def _process_note(self, key: str, content: str, category: str) -> None:
        """Extract entities and relationships from a single note."""

        # 1. Extract IPs (Hosts). dict.fromkeys de-duplicates while keeping
        #    first-seen order, so each host is handled once per note.
        hosts: List[str] = []
        for ip in dict.fromkeys(self._ip_pattern.findall(content)):
            if not self._is_valid_ipv4(ip):
                # e.g. "999.1.1.1" matches the pattern but is not an address.
                continue
            node_id = f"host:{ip}"
            self._add_node(node_id, "host", ip)
            hosts.append(node_id)

        # 2. Handle specific categories
        if category == "credential":
            self._process_credential(key, content, hosts)
        elif category == "finding":
            self._process_finding(key, content, hosts)
        elif category == "vulnerability":
            self._process_vulnerability(key, content, hosts)

        # 3. Link note to hosts (provenance)
        # We don't add the note itself as a node usually, but we could.
        # For now, we just use the note to build Host-to-Host or Host-to-Service links.

    def _add_node(self, node_id: str, node_type: str, label: str, **kwargs) -> None:
        """Add a node if it doesn't exist; an existing node is left untouched."""
        if not self.graph.has_node(node_id):
            self.graph.add_node(node_id, type=node_type, label=label, **kwargs)

    def _add_edge(self, source: str, target: str, edge_type: str, **kwargs) -> None:
        """Add an edge, but only when both endpoints are already in the graph."""
        if self.graph.has_node(source) and self.graph.has_node(target):
            self.graph.add_edge(source, target, type=edge_type, **kwargs)

    def _process_credential(self, key: str, content: str, related_hosts: List[str]) -> None:
        """Process a credential note.

        Creates one ``cred:<key>`` node and an AUTH_ACCESS edge from it to
        every host mentioned in the same note.
        """
        # Extract username
        user_match = self._user_pattern.search(content)
        username = user_match.group(1) if user_match else "unknown"

        cred_id = f"cred:{key}"
        self._add_node(cred_id, "credential", f"Creds ({username})")

        # If the note says "ssh", assume SSH access. The answer is the same
        # for every host, so it is computed once.
        protocol = "ssh" if "ssh" in content.lower() else "unknown"

        # Link cred to hosts it belongs to (or works on)
        for host_id in related_hosts:
            self._add_edge(cred_id, host_id, "AUTH_ACCESS", protocol=protocol)

    def _process_finding(self, key: str, content: str, related_hosts: List[str]) -> None:
        """Process a finding note (e.g., open ports).

        Every ``<port>/<tcp|udp>`` token in the note is attached to every host
        mentioned in the same note; the note text gives no finer mapping.
        """
        for raw_port, raw_proto in self._port_pattern.findall(content):
            port = self._normalize_port(raw_port)
            if port is None:
                continue
            proto = raw_proto.lower()

            for host_id in related_hosts:
                # The protocol is part of the id so that e.g. 53/tcp and
                # 53/udp on one host are two services rather than one.
                service_id = f"service:{host_id}:{port}/{proto}"
                self._add_node(service_id, "service", f"{port}/{proto}")
                self._add_edge(host_id, service_id, "HAS_SERVICE", protocol=proto)

    def _process_vulnerability(self, key: str, content: str, related_hosts: List[str]) -> None:
        """Process a vulnerability note.

        Creates one ``vuln:<key>`` node, labelled with the first CVE id found
        in the note (upper-cased) or "Vulnerability" when there is none, and
        an AFFECTED_BY edge from every host mentioned in the note.
        """
        vuln_id = f"vuln:{key}"

        # Try to extract CVE
        cve_match = self._cve_pattern.search(content)
        label = cve_match.group(0).upper() if cve_match else "Vulnerability"

        self._add_node(vuln_id, "vulnerability", label)

        for host_id in related_hosts:
            self._add_edge(host_id, vuln_id, "AFFECTED_BY")

    def get_strategic_insights(self) -> List[str]:
        """
        Analyze the graph and return natural language insights for the Orchestrator.

        Returns:
            One sentence per credential that has outgoing edges, followed by
            one sentence per host that has at least one service or
            vulnerability attached. Empty when nothing qualifies.
        """
        insights: List[str] = []
        nodes = self.graph.nodes

        # Insight 1: Unused Credentials
        # Find credentials that have AUTH_ACCESS to a host, but we haven't "explored" that host fully?
        # Or simply list valid access paths.
        for node, data in nodes(data=True):
            if data.get("type") != "credential":
                continue
            # Find what it connects to
            targets = list(self.graph.successors(node))
            if targets:
                target_labels = [str(nodes[t].get("label", t)) for t in targets]
                insights.append(
                    f"We have credentials that provide access to: {', '.join(target_labels)}"
                )

        # Insight 2: High Value Targets (Hosts with many open ports/vulns)
        for node, data in nodes(data=True):
            if data.get("type") != "host":
                continue
            successor_types = [
                nodes[neighbour].get("type")
                for neighbour in self.graph.successors(node)
            ]
            service_count = successor_types.count("service")
            vuln_count = successor_types.count("vulnerability")

            if service_count or vuln_count:
                # Fall back to the node id if a host was added without a label.
                host_label = data.get("label", node)
                insights.append(
                    f"Host {host_label} has {service_count} services and {vuln_count} known vulnerabilities."
                )

        # Insight 3: Potential Pivots (Host A -> Cred -> Host B)
        # This is harder without explicit "source" of creds, but we can infer.

        return insights

    def export_summary(self) -> str:
        """Export a text summary of the graph state.

        Returns:
            A single line with the number of host, credential and
            vulnerability nodes. Nodes without a ``type`` attribute are
            ignored rather than raising.
        """
        stats = {"host": 0, "credential": 0, "vulnerability": 0}
        for _, data in self.graph.nodes(data=True):
            node_type = data.get("type")
            if node_type in stats:
                stats[node_type] += 1

        return (
            f"Graph State: {stats['host']} Hosts, "
            f"{stats['credential']} Credentials, "
            f"{stats['vulnerability']} Vulnerabilities"
        )
|
||||
Reference in New Issue
Block a user