mirror of
https://github.com/GH05TCREW/pentestagent.git
synced 2026-03-07 14:23:20 +00:00
- Introduce a workspace command for CLI and TUI with create/activate, list, info, note, clear, export, import, and help actions - Persist workspace state via a marker file and enriched metadata (targets, operator notes, last_active_at, last_target) - Restore state on workspace activation and sync it to UI banner, agent state, and CLI output - Enforce target normalization and ensure the active target always exists in workspace targets - Route loot output to the workspace loot directory when a workspace is active - Prefer workspace-local knowledge paths for indexing and RAG resolution - Persist RAG indexes per workspace and load existing indexes before re-indexing - Add deterministic workspace export/import utilities (excluding caches) - Integrate workspace handling into TUI slash commands with modal help screen
512 lines
17 KiB
Python
512 lines
17 KiB
Python
"""RAG (Retrieval Augmented Generation) engine for PentestAgent."""
|
|
|
|
import json
|
|
from dataclasses import dataclass
|
|
from pathlib import Path
|
|
from typing import Any, Dict, List, Optional
|
|
|
|
import numpy as np
|
|
|
|
from .embeddings import get_embeddings
|
|
from ..workspaces.utils import resolve_knowledge_paths
|
|
|
|
|
|
@dataclass
class Document:
    """A chunk of knowledge.

    Attributes:
        content: The text of the chunk.
        source: Identifier of where the chunk came from (file path or "user").
        metadata: Arbitrary extra data; normalized to an empty dict when unset.
        embedding: The chunk's embedding vector, filled in by the engine.
        doc_id: Stable identifier derived from content+source when not given.
    """

    content: str
    source: str
    metadata: Optional[Dict[str, Any]] = None
    embedding: Optional[np.ndarray] = None
    doc_id: Optional[str] = None

    def __post_init__(self):
        # Normalize unset metadata so callers can always treat it as a mapping.
        if self.metadata is None:
            self.metadata = {}
        if self.doc_id is None:
            # Derive a stable ID from content+source. The builtin hash() is
            # salted per process (PYTHONHASHSEED), so hash-based IDs would not
            # survive the pickle save/load round-trip done by the RAG engine;
            # a digest keeps doc_id consistent across sessions.
            import hashlib

            self.doc_id = hashlib.sha1(
                f"{self.content}\x00{self.source}".encode("utf-8", errors="replace")
            ).hexdigest()
|
|
|
|
|
|
class RAGEngine:
    """Vector search over security knowledge.

    Indexes .txt/.md/.json files under a knowledge directory, embeds the
    resulting chunks (OpenAI API or local sentence-transformers), and serves
    cosine-similarity search. When a workspace is active (as reported by
    ``resolve_knowledge_paths``), the built index is persisted to and loaded
    from the workspace's embeddings directory.
    """

    def __init__(
        self,
        knowledge_path: Path = Path("knowledge"),
        embedding_model: str = "text-embedding-3-small",
        use_local_embeddings: bool = False,
    ):
        """
        Initialize the RAG engine.

        Args:
            knowledge_path: Path to the knowledge directory
            embedding_model: Model to use for embeddings
            use_local_embeddings: Whether to use local embeddings (sentence-transformers)
        """
        self.knowledge_path = knowledge_path
        self.embedding_model = embedding_model
        self.use_local_embeddings = use_local_embeddings
        self.documents: List[Document] = []
        self.embeddings: Optional[np.ndarray] = None
        self._indexed = False
        self._source_files: set = set()  # Track unique source files

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _embed_texts(self, texts: List[str]) -> np.ndarray:
        """Embed a batch of texts with the configured backend."""
        if self.use_local_embeddings:
            from .embeddings import get_embeddings_local

            return get_embeddings_local(texts)
        return get_embeddings(texts, model=self.embedding_model)

    def _query_similarities(self, query: str) -> np.ndarray:
        """Cosine similarity of every indexed document against *query*."""
        query_embedding = self._embed_texts([query])[0]
        # Epsilon guards against division by zero for all-zero vectors.
        return np.dot(self.embeddings, query_embedding) / (
            np.linalg.norm(self.embeddings, axis=1)
            * np.linalg.norm(query_embedding)
            + 1e-10
        )

    @staticmethod
    def _top_indices(similarities: np.ndarray, k: int, threshold: float) -> np.ndarray:
        """Indices of the top-*k* similarities at or above *threshold*,
        sorted by similarity descending; empty array when none qualify."""
        above = np.where(similarities >= threshold)[0]
        if len(above) == 0:
            return above
        return above[np.argsort(similarities[above])[::-1]][:k]

    def _try_load_workspace_index(self, kp: Optional[dict], force: bool) -> bool:
        """Best-effort load of a persisted workspace index.

        Returns True when an index was loaded (so indexing can be skipped).
        Creates the embeddings directory as a side effect, matching the
        original behavior even when *force* is set.
        """
        try:
            if kp and kp.get("using_workspace"):
                emb_dir = kp.get("embeddings")
                emb_dir.mkdir(parents=True, exist_ok=True)
                idx_path = emb_dir / "index.pkl"
                if idx_path.exists() and not force:
                    try:
                        self.load_index(idx_path)
                        return True
                    except Exception:
                        # Fall through to re-index if loading fails
                        pass
        except Exception:
            # Non-fatal — continue to index from sources
            pass
        return False

    def _collect_chunks(self, sources_base: Path) -> List[Document]:
        """Read every .txt/.md/.json file under *sources_base* into Documents."""
        chunks: List[Document] = []
        if not sources_base.exists():
            return chunks

        for file in sources_base.rglob("*"):
            if not file.is_file():
                continue
            try:
                if file.suffix in [".txt", ".md"]:
                    self._source_files.add(str(file))
                    content = file.read_text(encoding="utf-8", errors="ignore")
                    chunks.extend(self._chunk_text(content, source=str(file)))
                elif file.suffix == ".json":
                    self._source_files.add(str(file))
                    data = json.loads(file.read_text(encoding="utf-8"))
                    # A top-level list becomes one Document per item; any
                    # other JSON value becomes a single Document.
                    items = data if isinstance(data, list) else [data]
                    for item in items:
                        chunks.append(
                            Document(
                                content=json.dumps(item, indent=2),
                                source=str(file),
                                metadata=(
                                    item if isinstance(item, dict) else {"data": item}
                                ),
                            )
                        )
            except Exception as e:
                print(f"[RAG] Error processing {file}: {e}")
        return chunks

    def _persist_workspace_index(self, kp: Optional[dict]) -> None:
        """Best-effort save of the built index into the active workspace."""
        try:
            if kp and kp.get("using_workspace") and self.embeddings is not None:
                emb_dir = kp.get("embeddings")
                emb_dir.mkdir(parents=True, exist_ok=True)
                try:
                    self.save_index(emb_dir / "index.pkl")
                except Exception:
                    # ignore save failures
                    pass
        except Exception:
            pass

    # ------------------------------------------------------------------
    # Indexing
    # ------------------------------------------------------------------

    def index(self, force: bool = False):
        """
        Index all documents in knowledge directory.

        Args:
            force: Force re-indexing even if already indexed
        """
        if self._indexed and not force:
            return

        self._source_files = set()  # Reset source file tracking

        # Resolve knowledge paths (prefer workspace if available). A custom
        # knowledge_path bypasses workspace resolution entirely.
        if self.knowledge_path != Path("knowledge"):
            sources_base = self.knowledge_path
            kp = None
        else:
            kp = resolve_knowledge_paths()
            sources_base = kp.get("sources", Path("knowledge"))

        # If the workspace has a persisted index and we're not forcing a
        # reindex, use it and skip the scan.
        if self._try_load_workspace_index(kp, force):
            return

        chunks = self._collect_chunks(sources_base)
        self.documents = chunks

        # Generate embeddings and mirror them onto the documents.
        if chunks:
            self.embeddings = self._embed_texts([doc.content for doc in chunks])
            for i, doc in enumerate(self.documents):
                doc.embedding = self.embeddings[i]

        self._indexed = True
        # If using a workspace, persist the built index for faster future loads
        self._persist_workspace_index(kp)

    def _chunk_text(
        self, text: str, source: str, chunk_size: int = 1000, overlap: int = 200
    ) -> List[Document]:
        """
        Split text into overlapping chunks.

        Args:
            text: The text to split
            source: The source file path
            chunk_size: Maximum chunk size in characters
            overlap: Overlap between chunks

        Returns:
            List of Document objects
        """
        chunks: List[Document] = []

        # Split by paragraphs first for better context
        paragraphs = text.split("\n\n")
        current_chunk = ""

        for para in paragraphs:
            if len(current_chunk) + len(para) + 2 <= chunk_size:
                current_chunk += para + "\n\n"
            else:
                if current_chunk.strip():
                    chunks.append(
                        Document(content=current_chunk.strip(), source=source)
                    )
                current_chunk = para + "\n\n"

        # Add the last chunk
        if current_chunk.strip():
            chunks.append(Document(content=current_chunk.strip(), source=source))

        # If no paragraphs were found, fall back to simple chunking
        if not chunks and text.strip():
            # Guard: overlap >= chunk_size would make the step non-positive
            # and loop forever; clamp to advance at least one character.
            step = max(chunk_size - overlap, 1)
            start = 0
            while start < len(text):
                chunk = text[start : start + chunk_size]
                if chunk.strip():
                    chunks.append(Document(content=chunk.strip(), source=source))
                start += step

        return chunks

    # ------------------------------------------------------------------
    # Search
    # ------------------------------------------------------------------

    def search(
        self, query: str, k: int = 5, threshold: float = 0.35, max_tokens: int = 1500
    ) -> List[str]:
        """
        Find relevant documents for a query.

        Args:
            query: The search query
            k: Maximum number of results to return
            threshold: Minimum similarity threshold
            max_tokens: Maximum total tokens to return (prevents context bloat)

        Returns:
            List of relevant document contents
        """
        # Guard against empty/invalid queries
        if not query or not isinstance(query, str) or not query.strip():
            return []

        if not self._indexed:
            self.index()

        if not self.documents or self.embeddings is None:
            return []

        similarities = self._query_similarities(query)
        top_indices = self._top_indices(similarities, k, threshold)
        if len(top_indices) == 0:
            # No results above threshold - return empty rather than irrelevant content
            return []

        # Collect results up to max_tokens budget
        results: List[str] = []
        total_tokens = 0
        for idx in top_indices:
            content = self.documents[idx].content
            # Rough token estimate: ~4 chars per token
            chunk_tokens = len(content) // 4
            if total_tokens + chunk_tokens > max_tokens and results:
                # Stop if we'd exceed budget (but always include at least one)
                break
            results.append(content)
            total_tokens += chunk_tokens

        return results

    def search_with_scores(
        self, query: str, k: int = 5, threshold: float = 0.35
    ) -> List[tuple[Document, float]]:
        """
        Search with similarity scores.

        Args:
            query: The search query
            k: Maximum number of results to return
            threshold: Minimum similarity threshold

        Returns:
            List of (Document, score) tuples above threshold (falls back to
            the single best match when nothing clears the threshold)
        """
        # Guard against empty/invalid queries (same contract as search()),
        # avoiding an embedding call on an empty string.
        if not query or not isinstance(query, str) or not query.strip():
            return []

        if not self._indexed:
            self.index()

        if not self.documents or self.embeddings is None:
            return []

        similarities = self._query_similarities(query)
        top_indices = self._top_indices(similarities, k, threshold)
        if len(top_indices) == 0:
            # Fallback: return single best result even if below threshold
            top_indices = [np.argmax(similarities)]

        return [(self.documents[i], float(similarities[i])) for i in top_indices]

    # ------------------------------------------------------------------
    # Mutation
    # ------------------------------------------------------------------

    def add_document(
        self, content: str, source: str = "user", metadata: Optional[dict] = None
    ):
        """
        Add a document to the knowledge base.

        Args:
            content: The document content
            source: The source identifier
            metadata: Optional metadata
        """
        doc = Document(content=content, source=source, metadata=metadata)

        new_embedding = self._embed_texts([content])
        doc.embedding = new_embedding[0]
        self.documents.append(doc)

        # Update embeddings array
        if self.embeddings is not None:
            self.embeddings = np.vstack([self.embeddings, new_embedding])
        else:
            self.embeddings = new_embedding

    def add_documents(self, documents: List[Document]):
        """
        Add multiple documents to the knowledge base.

        Args:
            documents: List of Document objects to add
        """
        if not documents:
            return

        new_embeddings = self._embed_texts([doc.content for doc in documents])

        for i, doc in enumerate(documents):
            doc.embedding = new_embeddings[i]
            self.documents.append(doc)

        if self.embeddings is not None:
            self.embeddings = np.vstack([self.embeddings, new_embeddings])
        else:
            self.embeddings = new_embeddings

    def remove_document(self, doc_id: str) -> bool:
        """
        Remove a document by ID.

        Args:
            doc_id: The document ID to remove

        Returns:
            True if removed, False if not found
        """
        for i, doc in enumerate(self.documents):
            if doc.doc_id == doc_id:
                self.documents.pop(i)
                if self.embeddings is not None:
                    self.embeddings = np.delete(self.embeddings, i, axis=0)
                return True
        return False

    def clear(self):
        """Clear all documents and embeddings."""
        self.documents.clear()
        self.embeddings = None
        self._indexed = False
        self._source_files = set()

    def get_document_count(self) -> int:
        """Get the number of source files indexed."""
        return len(self._source_files)

    def get_chunk_count(self) -> int:
        """Get the number of indexed chunks (internal document segments)."""
        return len(self.documents)

    # ------------------------------------------------------------------
    # Persistence
    # ------------------------------------------------------------------

    def save_index(self, path: Path):
        """
        Save the index to disk.

        Args:
            path: Path to save the index
        """
        import pickle

        data = {
            "documents": [
                {
                    "content": doc.content,
                    "source": doc.source,
                    "metadata": doc.metadata,
                    "doc_id": doc.doc_id,
                }
                for doc in self.documents
            ],
            "embeddings": self.embeddings,
        }

        with open(path, "wb") as f:
            pickle.dump(data, f)

    def save_index_to_workspace(self, root: Optional[Path] = None, filename: str = "index.pkl"):
        """
        Convenience helper to save the index into the active workspace embeddings path.

        Args:
            root: Optional project root to resolve workspaces (defaults to cwd)
            filename: Filename to use for the saved index
        """
        kp = resolve_knowledge_paths(root=root)
        emb_dir = kp.get("embeddings")
        emb_dir.mkdir(parents=True, exist_ok=True)
        self.save_index(Path(emb_dir) / filename)

    def load_index(self, path: Path):
        """
        Load the index from disk.

        NOTE(security): this unpickles *path*; only load index files written
        by this tool into trusted locations — pickle can execute arbitrary
        code when fed untrusted data.

        Args:
            path: Path to load the index from
        """
        import pickle

        with open(path, "rb") as f:
            data = pickle.load(f)

        self.documents = [
            Document(
                content=d["content"],
                source=d["source"],
                metadata=d["metadata"],
                doc_id=d["doc_id"],
            )
            for d in data["documents"]
        ]
        self.embeddings = data["embeddings"]

        # Restore embeddings in documents
        if self.embeddings is not None:
            for i, doc in enumerate(self.documents):
                doc.embedding = self.embeddings[i]

        self._indexed = True

    def load_index_from_workspace(self, root: Optional[Path] = None, filename: str = "index.pkl"):
        """
        Convenience helper to load the index from the active workspace embeddings path.

        Args:
            root: Optional project root to resolve workspaces (defaults to cwd)
            filename: Filename used for the saved index

        Raises:
            FileNotFoundError: If no index exists at the workspace path
        """
        kp = resolve_knowledge_paths(root=root)
        emb_dir = kp.get("embeddings")
        path = Path(emb_dir) / filename
        if not path.exists():
            raise FileNotFoundError(f"Workspace index not found: {path}")
        self.load_index(path)