Files
pentestagent/pentestagent/knowledge/rag.py
giveen e8ab673a13 feat(workspaces): add unified /workspace lifecycle, target persistence, and workspace-scoped RAG
- Introduce `/workspace` command for CLI and TUI with create/activate, list, info, note, clear, export, import, and help actions
- Persist workspace state via  marker and enriched  (targets, operator notes, last_active_at, last_target)
- Restore  on workspace activation and sync it to UI banner, agent state, and CLI output
- Enforce target normalization and ensure  always exists in workspace targets
- Route loot output to  when a workspace is active
- Prefer workspace-local knowledge paths for indexing and RAG resolution
- Persist RAG indexes per workspace and load existing indexes before re-indexing
- Add deterministic workspace export/import utilities (excluding caches)
- Integrate workspace handling into TUI slash commands with modal help screen
2026-01-19 08:41:38 -07:00

512 lines
17 KiB
Python

"""RAG (Retrieval Augmented Generation) engine for PentestAgent."""
import hashlib
import json
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, List, Optional

import numpy as np

from .embeddings import get_embeddings
from ..workspaces.utils import resolve_knowledge_paths
@dataclass
class Document:
    """A chunk of knowledge with its provenance, metadata, and embedding.

    Attributes:
        content: The chunk text.
        source: Where the chunk came from (usually a file path, or "user").
        metadata: Arbitrary extra data about the chunk (defaults to {}).
        embedding: Vector embedding of ``content``, filled in by the engine.
        doc_id: Stable unique identifier derived from content + source.
    """

    content: str
    source: str
    metadata: Optional[Dict[str, Any]] = None
    embedding: Optional[np.ndarray] = None
    doc_id: Optional[str] = None

    def __post_init__(self):
        if self.metadata is None:
            self.metadata = {}
        if self.doc_id is None:
            # Use a content-addressed digest instead of builtin hash():
            # str hashing is salted per process (PYTHONHASHSEED), so hash()
            # yields different doc_ids for identical documents across runs,
            # which breaks persisted indexes and externally-held IDs.
            key = f"{self.content}\x00{self.source}"
            self.doc_id = hashlib.sha1(
                key.encode("utf-8", "surrogatepass")
            ).hexdigest()
class RAGEngine:
    """Vector search over security knowledge."""

    def __init__(
        self,
        knowledge_path: Path = Path("knowledge"),
        embedding_model: str = "text-embedding-3-small",
        use_local_embeddings: bool = False,
    ):
        """Set up an empty, not-yet-indexed engine.

        Args:
            knowledge_path: Directory holding the knowledge sources.
            embedding_model: Name of the remote embedding model.
            use_local_embeddings: If True, embed locally with
                sentence-transformers instead of the remote model.
        """
        # Configuration.
        self.knowledge_path = knowledge_path
        self.embedding_model = embedding_model
        self.use_local_embeddings = use_local_embeddings
        # Index state, populated by index().
        self.documents: List[Document] = []
        self.embeddings: Optional[np.ndarray] = None
        self._indexed = False
        # Unique source files seen during the last indexing pass.
        self._source_files: set = set()
    def index(self, force: bool = False) -> None:
        """
        Index all documents in knowledge directory.

        Reads .txt/.md files (chunked via _chunk_text) and .json files
        (one Document per list item, or one for the whole object), embeds
        all chunks, and marks the engine as indexed. When a workspace is
        active, a persisted index is loaded instead of re-indexing (unless
        ``force``), and a freshly built index is saved back for reuse.

        Args:
            force: Force re-indexing even if already indexed
        """
        # Idempotent unless the caller explicitly forces a rebuild.
        if self._indexed and not force:
            return
        chunks = []
        self._source_files = set()  # Reset source file tracking
        # Resolve knowledge paths (prefer workspace if available)
        if self.knowledge_path != Path("knowledge"):
            # An explicit non-default path wins; skip workspace resolution
            # entirely (kp=None also disables workspace load/save below).
            sources_base = self.knowledge_path
            kp = None
        else:
            # kp is dict-like; this block reads the keys "sources",
            # "using_workspace" and "embeddings" from it.
            kp = resolve_knowledge_paths()
            sources_base = kp.get("sources", Path("knowledge"))
        # If workspace has a persisted index and we're not forcing reindex, try to load it
        try:
            if kp and kp.get("using_workspace"):
                emb_dir = kp.get("embeddings")
                emb_dir.mkdir(parents=True, exist_ok=True)
                idx_path = emb_dir / "index.pkl"
                if idx_path.exists() and not force:
                    try:
                        # load_index sets documents/embeddings/_indexed.
                        self.load_index(idx_path)
                        return
                    except Exception:
                        # Fall through to re-index if loading fails
                        pass
        except Exception:
            # Non-fatal — continue to index from sources
            pass
        # Process all files in knowledge directory
        if sources_base.exists():
            for file in sources_base.rglob("*"):
                if not file.is_file():
                    continue
                try:
                    if file.suffix in [".txt", ".md"]:
                        self._source_files.add(str(file))
                        content = file.read_text(encoding="utf-8", errors="ignore")
                        file_chunks = self._chunk_text(content, source=str(file))
                        chunks.extend(file_chunks)
                    elif file.suffix == ".json":
                        self._source_files.add(str(file))
                        data = json.loads(file.read_text(encoding="utf-8"))
                        if isinstance(data, list):
                            # One Document per list element keeps chunks small;
                            # dict elements double as the chunk's metadata.
                            for item in data:
                                chunks.append(
                                    Document(
                                        content=json.dumps(item, indent=2),
                                        source=str(file),
                                        metadata=(
                                            item
                                            if isinstance(item, dict)
                                            else {"data": item}
                                        ),
                                    )
                                )
                        else:
                            # Non-list JSON becomes a single Document.
                            chunks.append(
                                Document(
                                    content=json.dumps(data, indent=2),
                                    source=str(file),
                                    metadata=(
                                        data
                                        if isinstance(data, dict)
                                        else {"data": data}
                                    ),
                                )
                            )
                except Exception as e:
                    # Per-file failures are reported but don't abort indexing.
                    print(f"[RAG] Error processing {file}: {e}")
        self.documents = chunks
        # Generate embeddings
        if chunks:
            texts = [doc.content for doc in chunks]
            if self.use_local_embeddings:
                from .embeddings import get_embeddings_local

                self.embeddings = get_embeddings_local(texts)
            else:
                self.embeddings = get_embeddings(texts, model=self.embedding_model)
            # Store embeddings in documents
            for i, doc in enumerate(self.documents):
                doc.embedding = self.embeddings[i]
        self._indexed = True
        # If using a workspace, persist the built index for faster future loads
        try:
            if kp and kp.get("using_workspace") and self.embeddings is not None:
                emb_dir = kp.get("embeddings")
                emb_dir.mkdir(parents=True, exist_ok=True)
                idx_path = emb_dir / "index.pkl"
                try:
                    self.save_index(idx_path)
                except Exception:
                    # ignore save failures
                    pass
        except Exception:
            # Persistence is best-effort only; never fail the index build.
            pass
def _chunk_text(
self, text: str, source: str, chunk_size: int = 1000, overlap: int = 200
) -> List[Document]:
"""
Split text into overlapping chunks.
Args:
text: The text to split
source: The source file path
chunk_size: Maximum chunk size in characters
overlap: Overlap between chunks
Returns:
List of Document objects
"""
chunks = []
# Split by paragraphs first for better context
paragraphs = text.split("\n\n")
current_chunk = ""
for para in paragraphs:
if len(current_chunk) + len(para) + 2 <= chunk_size:
current_chunk += para + "\n\n"
else:
if current_chunk.strip():
chunks.append(
Document(content=current_chunk.strip(), source=source)
)
current_chunk = para + "\n\n"
# Add the last chunk
if current_chunk.strip():
chunks.append(Document(content=current_chunk.strip(), source=source))
# If no paragraphs were found, fall back to simple chunking
if not chunks and text.strip():
start = 0
while start < len(text):
end = start + chunk_size
chunk = text[start:end]
if chunk.strip():
chunks.append(Document(content=chunk.strip(), source=source))
start = end - overlap
return chunks
def search(
self, query: str, k: int = 5, threshold: float = 0.35, max_tokens: int = 1500
) -> List[str]:
"""
Find relevant documents for a query.
Args:
query: The search query
k: Maximum number of results to return
threshold: Minimum similarity threshold
max_tokens: Maximum total tokens to return (prevents context bloat)
Returns:
List of relevant document contents
"""
# Guard against empty/invalid queries
if not query or not isinstance(query, str) or not query.strip():
return []
if not self._indexed:
self.index()
if not self.documents or self.embeddings is None:
return []
# Get query embedding
if self.use_local_embeddings:
from .embeddings import get_embeddings_local
query_embedding = get_embeddings_local([query])[0]
else:
query_embedding = get_embeddings([query], model=self.embedding_model)[0]
# Compute cosine similarities
similarities = np.dot(self.embeddings, query_embedding) / (
np.linalg.norm(self.embeddings, axis=1) * np.linalg.norm(query_embedding)
+ 1e-10
)
# Get top k indices above threshold
indices_above_threshold = np.where(similarities >= threshold)[0]
if len(indices_above_threshold) > 0:
# Sort by similarity (descending) and take top k
sorted_indices = indices_above_threshold[
np.argsort(similarities[indices_above_threshold])[::-1]
]
top_indices = sorted_indices[:k]
else:
# No results above threshold - return empty rather than irrelevant content
return []
# Collect results up to max_tokens budget
results = []
total_tokens = 0
for idx in top_indices:
content = self.documents[idx].content
# Rough token estimate: ~4 chars per token
chunk_tokens = len(content) // 4
if total_tokens + chunk_tokens > max_tokens and results:
# Stop if we'd exceed budget (but always include at least one)
break
results.append(content)
total_tokens += chunk_tokens
return results
def search_with_scores(
self, query: str, k: int = 5, threshold: float = 0.35
) -> List[tuple[Document, float]]:
"""
Search with similarity scores.
Args:
query: The search query
k: Maximum number of results to return
threshold: Minimum similarity threshold
Returns:
List of (Document, score) tuples above threshold
"""
if not self._indexed:
self.index()
if not self.documents or self.embeddings is None:
return []
# Get query embedding
if self.use_local_embeddings:
from .embeddings import get_embeddings_local
query_embedding = get_embeddings_local([query])[0]
else:
query_embedding = get_embeddings([query], model=self.embedding_model)[0]
# Compute cosine similarities
similarities = np.dot(self.embeddings, query_embedding) / (
np.linalg.norm(self.embeddings, axis=1) * np.linalg.norm(query_embedding)
+ 1e-10
)
# Get top k above threshold
indices_above_threshold = np.where(similarities >= threshold)[0]
if len(indices_above_threshold) > 0:
sorted_indices = indices_above_threshold[
np.argsort(similarities[indices_above_threshold])[::-1]
]
top_indices = sorted_indices[:k]
else:
# Fallback: return single best result even if below threshold
top_indices = [np.argmax(similarities)]
return [(self.documents[i], float(similarities[i])) for i in top_indices]
def add_document(
self, content: str, source: str = "user", metadata: Optional[dict] = None
):
"""
Add a document to the knowledge base.
Args:
content: The document content
source: The source identifier
metadata: Optional metadata
"""
doc = Document(content=content, source=source, metadata=metadata)
# Generate embedding
if self.use_local_embeddings:
from .embeddings import get_embeddings_local
new_embedding = get_embeddings_local([content])
else:
new_embedding = get_embeddings([content], model=self.embedding_model)
doc.embedding = new_embedding[0]
self.documents.append(doc)
# Update embeddings array
if self.embeddings is not None:
self.embeddings = np.vstack([self.embeddings, new_embedding])
else:
self.embeddings = new_embedding
def add_documents(self, documents: List[Document]):
"""
Add multiple documents to the knowledge base.
Args:
documents: List of Document objects to add
"""
if not documents:
return
texts = [doc.content for doc in documents]
if self.use_local_embeddings:
from .embeddings import get_embeddings_local
new_embeddings = get_embeddings_local(texts)
else:
new_embeddings = get_embeddings(texts, model=self.embedding_model)
for i, doc in enumerate(documents):
doc.embedding = new_embeddings[i]
self.documents.append(doc)
if self.embeddings is not None:
self.embeddings = np.vstack([self.embeddings, new_embeddings])
else:
self.embeddings = new_embeddings
def remove_document(self, doc_id: str) -> bool:
"""
Remove a document by ID.
Args:
doc_id: The document ID to remove
Returns:
True if removed, False if not found
"""
for i, doc in enumerate(self.documents):
if doc.doc_id == doc_id:
self.documents.pop(i)
if self.embeddings is not None:
self.embeddings = np.delete(self.embeddings, i, axis=0)
return True
return False
def clear(self):
"""Clear all documents and embeddings."""
self.documents.clear()
self.embeddings = None
self._indexed = False
self._source_files = set()
def get_document_count(self) -> int:
"""Get the number of source files indexed."""
return len(self._source_files)
def get_chunk_count(self) -> int:
"""Get the number of indexed chunks (internal document segments)."""
return len(self.documents)
def save_index(self, path: Path):
"""
Save the index to disk.
Args:
path: Path to save the index
"""
import pickle
data = {
"documents": [
{
"content": doc.content,
"source": doc.source,
"metadata": doc.metadata,
"doc_id": doc.doc_id,
}
for doc in self.documents
],
"embeddings": self.embeddings,
}
with open(path, "wb") as f:
pickle.dump(data, f)
def save_index_to_workspace(self, root: Optional[Path] = None, filename: str = "index.pkl"):
"""
Convenience helper to save the index into the active workspace embeddings path.
Args:
root: Optional project root to resolve workspaces (defaults to cwd)
filename: Filename to use for the saved index
"""
from pathlib import Path as _P
kp = resolve_knowledge_paths(root=root)
emb_dir = kp.get("embeddings")
emb_dir.mkdir(parents=True, exist_ok=True)
path = _P(emb_dir) / filename
self.save_index(path)
def load_index(self, path: Path):
"""
Load the index from disk.
Args:
path: Path to load the index from
"""
import pickle
with open(path, "rb") as f:
data = pickle.load(f)
self.documents = [
Document(
content=d["content"],
source=d["source"],
metadata=d["metadata"],
doc_id=d["doc_id"],
)
for d in data["documents"]
]
self.embeddings = data["embeddings"]
# Restore embeddings in documents
if self.embeddings is not None:
for i, doc in enumerate(self.documents):
doc.embedding = self.embeddings[i]
self._indexed = True
def load_index_from_workspace(self, root: Optional[Path] = None, filename: str = "index.pkl"):
"""
Convenience helper to load the index from the active workspace embeddings path.
Args:
root: Optional project root to resolve workspaces (defaults to cwd)
filename: Filename used for the saved index
"""
from pathlib import Path as _P
kp = resolve_knowledge_paths(root=root)
emb_dir = kp.get("embeddings")
path = _P(emb_dir) / filename
if not path.exists():
raise FileNotFoundError(f"Workspace index not found: {path}")
self.load_index(path)