feat(crew): introduce shadow knowledge graph for orchestrator decision-making

This commit is contained in:
GH05TCREW
2025-12-13 09:40:03 -07:00
parent a3de1ab4b4
commit 1990c0fcb5
4 changed files with 260 additions and 0 deletions

View File

@@ -9,6 +9,7 @@ from ..prompts import ghost_crew
from .models import CrewState, WorkerCallback
from .tools import create_crew_tools
from .worker_pool import WorkerPool
from ...knowledge.graph import ShadowGraph
if TYPE_CHECKING:
from ...llm import LLM
@@ -39,6 +40,7 @@ class CrewOrchestrator:
self.state = CrewState.IDLE
self.pool: Optional[WorkerPool] = None
self.graph = ShadowGraph()
self._messages: List[Dict[str, Any]] = []
def _get_system_prompt(self) -> str:
@@ -53,9 +55,68 @@ class CrewOrchestrator:
"\n".join(tool_lines) if tool_lines else "No tools available"
)
# Get saved notes if available
notes_context = ""
graph_insights = []
try:
from ...tools.notes import get_all_notes_sync
notes = get_all_notes_sync()
if notes:
# Update shadow graph
self.graph.update_from_notes(notes)
graph_insights = self.graph.get_strategic_insights()
# Group by category
grouped = {}
for key, data in notes.items():
# Handle legacy string format just in case
if isinstance(data, str):
cat = "info"
content = data
else:
cat = data.get("category", "info")
content = data.get("content", "")
# Truncate long notes in system prompt to save tokens
if len(content) > 200:
content = content[:197] + "..."
if cat not in grouped:
grouped[cat] = []
grouped[cat].append(f"- {key}: {content}")
# Format output with specific order
sections = []
order = ["credential", "vulnerability", "finding", "artifact", "task", "info"]
for cat in order:
if cat in grouped:
header = cat.title() + "s"
if cat == "info":
header = "General Information"
sections.append(f"## {header}")
sections.append("\n".join(grouped[cat]))
# Add any remaining categories
for cat in sorted(grouped.keys()):
if cat not in order:
sections.append(f"## {cat.title()}")
sections.append("\n".join(grouped[cat]))
notes_context = "\n\n".join(sections)
except Exception:
pass # Notes not available
# Format insights for prompt
insights_text = ""
if graph_insights:
insights_text = "\n\n## Strategic Insights (Graph Analysis)\n" + "\n".join(f"- {i}" for i in graph_insights)
return ghost_crew.render(
target=self.target or "Not specified",
prior_context=self.prior_context or "None - starting fresh",
notes_context=notes_context + insights_text,
worker_tools=worker_tools_formatted,
environment={
"os": platform.system(),

View File

@@ -21,6 +21,12 @@ This is an authorized penetration testing engagement. All targets are in scope.
{{ prior_context }}
{% endif %}
{% if notes_context %}
## Saved Notes (Shared Knowledge)
(Notes may be truncated. Use `notes(action='read', key='...')` to see full content if needed.)
{{ notes_context }}
{% endif %}
## Your Capabilities
You manage agents using these tools:
- **spawn_agent**: Deploy an agent with a specific task. Be explicit about which tools to use.

View File

@@ -0,0 +1,189 @@
"""
Shadow Graph implementation for GhostCrew.
This module provides a lightweight knowledge graph that is built automatically
from agent notes. It is used by the Orchestrator to compute strategic insights
(e.g., "we have creds for X but haven't scanned it") without burdening the
agents with graph management.
Architecture:
Notes (Source of Truth) -> Shadow Graph (Derived View) -> Insights (Strategic Hints)
"""
import logging
import re
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Set, Tuple
import networkx as nx
logger = logging.getLogger(__name__)
@dataclass
class GraphNode:
"""A node in the shadow graph."""
id: str
type: str # host, service, credential, finding, artifact
label: str
metadata: Dict[str, Any] = field(default_factory=dict)
def __hash__(self):
return hash(self.id)
@dataclass
class GraphEdge:
"""An edge in the shadow graph."""
source: str
target: str
type: str # CONNECTS_TO, HAS_SERVICE, AUTH_ACCESS, RELATED_TO
metadata: Dict[str, Any] = field(default_factory=dict)
class ShadowGraph:
"""
A NetworkX-backed knowledge graph that derives its state from notes.
"""
def __init__(self):
self.graph = nx.DiGraph()
self._processed_notes: Set[str] = set()
# Regex patterns for entity extraction
self._ip_pattern = re.compile(r'\b(?:\d{1,3}\.){3}\d{1,3}\b')
self._port_pattern = re.compile(r'(\d{1,5})/(tcp|udp)')
self._user_pattern = re.compile(r'user[:\s]+([a-zA-Z0-9_.-]+)', re.IGNORECASE)
def update_from_notes(self, notes: Dict[str, Dict[str, Any]]) -> None:
"""
Update the graph based on new notes.
This method is idempotent and incremental. It only processes notes
that haven't been seen before (based on key).
"""
for key, note_data in notes.items():
if key in self._processed_notes:
continue
# Handle legacy format
if isinstance(note_data, str):
content = note_data
category = "info"
else:
content = note_data.get("content", "")
category = note_data.get("category", "info")
self._process_note(key, content, category)
self._processed_notes.add(key)
def _process_note(self, key: str, content: str, category: str) -> None:
"""Extract entities and relationships from a single note."""
# 1. Extract IPs (Hosts)
ips = self._ip_pattern.findall(content)
hosts = []
for ip in ips:
node_id = f"host:{ip}"
self._add_node(node_id, "host", ip)
hosts.append(node_id)
# 2. Handle specific categories
if category == "credential":
self._process_credential(key, content, hosts)
elif category == "finding":
self._process_finding(key, content, hosts)
elif category == "vulnerability":
self._process_vulnerability(key, content, hosts)
# 3. Link note to hosts (provenance)
# We don't add the note itself as a node usually, but we could.
# For now, we just use the note to build Host-to-Host or Host-to-Service links.
def _add_node(self, node_id: str, node_type: str, label: str, **kwargs) -> None:
"""Add a node if it doesn't exist."""
if not self.graph.has_node(node_id):
self.graph.add_node(node_id, type=node_type, label=label, **kwargs)
def _add_edge(self, source: str, target: str, edge_type: str, **kwargs) -> None:
"""Add an edge."""
if self.graph.has_node(source) and self.graph.has_node(target):
self.graph.add_edge(source, target, type=edge_type, **kwargs)
def _process_credential(self, key: str, content: str, related_hosts: List[str]) -> None:
"""Process a credential note."""
# Extract username
user_match = self._user_pattern.search(content)
username = user_match.group(1) if user_match else "unknown"
cred_id = f"cred:{key}"
self._add_node(cred_id, "credential", f"Creds ({username})")
# Link cred to hosts it belongs to (or works on)
for host_id in related_hosts:
# If the note says "ssh", assume SSH access
protocol = "ssh" if "ssh" in content.lower() else "unknown"
self._add_edge(cred_id, host_id, "AUTH_ACCESS", protocol=protocol)
def _process_finding(self, key: str, content: str, related_hosts: List[str]) -> None:
"""Process a finding note (e.g., open ports)."""
# Extract ports
ports = self._port_pattern.findall(content)
for port, proto in ports:
for host_id in related_hosts:
service_id = f"service:{host_id}:{port}"
self._add_node(service_id, "service", f"{port}/{proto}")
self._add_edge(host_id, service_id, "HAS_SERVICE", protocol=proto)
def _process_vulnerability(self, key: str, content: str, related_hosts: List[str]) -> None:
"""Process a vulnerability note."""
vuln_id = f"vuln:{key}"
# Try to extract CVE
cve_match = re.search(r'CVE-\d{4}-\d{4,7}', content, re.IGNORECASE)
label = cve_match.group(0) if cve_match else "Vulnerability"
self._add_node(vuln_id, "vulnerability", label)
for host_id in related_hosts:
self._add_edge(host_id, vuln_id, "AFFECTED_BY")
def get_strategic_insights(self) -> List[str]:
"""
Analyze the graph and return natural language insights for the Orchestrator.
"""
insights = []
# Insight 1: Unused Credentials
# Find credentials that have AUTH_ACCESS to a host, but we haven't "explored" that host fully?
# Or simply list valid access paths.
for node, data in self.graph.nodes(data=True):
if data.get("type") == "credential":
# Find what it connects to
targets = [v for u, v in self.graph.out_edges(node)]
if targets:
target_labels = [self.graph.nodes[t].get("label", t) for t in targets]
insights.append(f"We have credentials that provide access to: {', '.join(target_labels)}")
# Insight 2: High Value Targets (Hosts with many open ports/vulns)
for node, data in self.graph.nodes(data=True):
if data.get("type") == "host":
# Count services
services = [v for u, v in self.graph.out_edges(node) if self.graph.nodes[v].get("type") == "service"]
vulns = [v for u, v in self.graph.out_edges(node) if self.graph.nodes[v].get("type") == "vulnerability"]
if len(services) > 0 or len(vulns) > 0:
insights.append(f"Host {data['label']} has {len(services)} services and {len(vulns)} known vulnerabilities.")
# Insight 3: Potential Pivots (Host A -> Cred -> Host B)
# This is harder without explicit "source" of creds, but we can infer.
return insights
def export_summary(self) -> str:
"""Export a text summary of the graph state."""
stats = {
"hosts": len([n for n, d in self.graph.nodes(data=True) if d['type'] == 'host']),
"creds": len([n for n, d in self.graph.nodes(data=True) if d['type'] == 'credential']),
"vulns": len([n for n, d in self.graph.nodes(data=True) if d['type'] == 'vulnerability']),
}
return f"Graph State: {stats['hosts']} Hosts, {stats['creds']} Credentials, {stats['vulns']} Vulnerabilities"

View File

@@ -18,6 +18,10 @@ httpx>=0.27.0
# RAG / Embeddings
numpy>=1.26.0
sentence-transformers>=2.7.0
# Graph
networkx>=3.3
faiss-cpu>=1.8.0
# Docker