# File: pentestagent/rag_split.py
# Last modified: 2025-05-15 16:29:56 -06:00
# 120 lines, 3.9 KiB, Python
# Standard library
import json
import os

# Third-party
import numpy as np
from dotenv import load_dotenv
from ollama import chat, Message
from ollama import embeddings
from openai import OpenAI

# Load environment variables from .env so the API credentials below resolve.
load_dotenv()

# Print numpy arrays in full — useful when inspecting embedding vectors.
np.set_printoptions(threshold=np.inf)

# Shared OpenAI-compatible client. OPENAI_BASE_URL lets this point at a
# proxy or alternate endpoint; both values come from the environment.
client = OpenAI(
    api_key=os.getenv("OPENAI_API_KEY"),
    base_url=os.getenv("OPENAI_BASE_URL"),
)
class Kb:
    """A minimal retrieval knowledge base.

    Reads every regular file in a directory, concatenates the text, splits
    it into fixed-size character chunks, embeds each chunk through the
    OpenAI embeddings API, and answers queries by cosine-similarity search
    over those embeddings.

    Attributes:
        docs: list[str] of text chunks (empty if loading failed).
        embedss: 2-D numpy array of chunk embeddings, row-aligned with docs.
    """

    def __init__(self, dirpath):
        """Load and embed all readable files under *dirpath*.

        On any failure (missing directory, unreadable or empty files) the
        instance stays usable but empty: ``docs == []`` and ``embedss`` is
        an empty array, rather than raising.
        """
        if not os.path.isdir(dirpath):
            print(f"Error: {dirpath} is not a valid directory.")
            self.docs = []
            self.embedss = np.array([])
            return
        parts = []
        for filename in os.listdir(dirpath):
            filepath = os.path.join(dirpath, filename)
            if not os.path.isfile(filepath):
                continue
            try:
                with open(filepath, 'r', encoding="utf-8") as f:
                    parts.append(f.read())
            # Narrowed from a blanket `except Exception`: only I/O and
            # decoding failures are expected here; anything else is a bug.
            except (OSError, UnicodeDecodeError) as e:
                print(f"Error reading file {filepath}: {e}")
        # Trailing newline per file, matching the original concatenation.
        all_content = "".join(part + "\n" for part in parts)
        if not all_content.strip():
            print(f"Warning: No content found in directory {dirpath}.")
            self.docs = []
            self.embedss = np.array([])
            return
        self.docs = self.split_content(all_content)
        self.embedss = self.encode(self.docs) if self.docs else np.array([])

    @staticmethod
    def split_content(content, max_length=5000):
        """Split *content* into consecutive chunks of at most *max_length* chars."""
        return [content[i:i + max_length] for i in range(0, len(content), max_length)]

    def encode(self, texts):
        """Embed each string in *texts*; returns a (len(texts), dim) float array."""
        embeds = []
        for text in texts:
            completion = client.embeddings.create(
                model="text-embedding-ada-002",
                input=text,
                encoding_format="float",
            )
            # Read the embedding straight off the response object; the
            # original dumped the response to JSON and re-parsed it just
            # to reach this one field.
            embeds.append(completion.data[0].embedding)
        return np.array(embeds)

    @staticmethod
    def similarity(A, B):
        """Cosine similarity of vectors A and B; 0.0 if either is a zero vector.

        The zero-vector guard avoids the NaN the bare division produced.
        """
        norm_product = np.linalg.norm(A) * np.linalg.norm(B)
        if norm_product == 0:
            return 0.0
        return np.dot(A, B) / norm_product

    def search(self, query):
        """Return the stored chunk most similar to *query*, or None if the KB is empty.

        The empty guard fixes an IndexError the original raised when no
        documents were loaded.
        """
        if len(self.embedss) == 0:
            return None
        query_embedding = self.encode([query])[0]
        best_idx = 0
        # Start at -inf (not 0) so all-negative similarities still pick the
        # genuinely closest chunk instead of silently defaulting to index 0.
        best_sim = -np.inf
        for idx, emb in enumerate(self.embedss):
            sim = self.similarity(query_embedding, emb)
            if sim > best_sim:
                best_sim = sim
                best_idx = idx
        return self.docs[best_idx]
if __name__ == "__main__":
    # Demo: build a throwaway knowledge base on disk and run one query.
    test_kb_dir = "knowledge_test"
    os.makedirs(test_kb_dir, exist_ok=True)
    doc_path = os.path.join(test_kb_dir, "test_doc.txt")
    with open(doc_path, 'w', encoding='utf-8') as f:
        f.write("This is a test document for security audit information.")

    kb = Kb(test_kb_dir)
    if not kb.docs:
        print("Knowledge base is empty or failed to load.")
    else:
        result = kb.search("security audit")
        print(f"Search result: {result}")

    # Uncomment to remove the demo directory afterwards:
    # import shutil
    # shutil.rmtree(test_kb_dir, ignore_errors=True)