* Validate Python formatting on every build with Ruff
* Fix lint warnings
Anton Larin committed 2023-05-13 10:36:17 +02:00 (committed by GitHub)
parent 168648e789, commit 962becb9a5
35 changed files with 277 additions and 252 deletions
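
The commit itself only reorders imports, normalizes comments, and tightens conditionals; the "validate on every build" part lives in CI configuration that this diff does not show. As a hedged sketch of the intent only, a build script could gate on Ruff like this (the hook below is hypothetical, not part of this commit):

import subprocess
import sys

# Run Ruff over the repository; a non-zero exit code fails the build.
# Ruff's import-sorting rules (the "I" group) produce exactly the kind
# of reordering seen in the hunks below.
result = subprocess.run(["ruff", "check", "."])
sys.exit(result.returncode)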

View File

@@ -1,17 +1,10 @@
from pathlib import Path
from langchain.text_splitter import CharacterTextSplitter
import faiss
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
import pickle
import dotenv
import tiktoken
import sys
from argparse import ArgumentParser
import ast
import json
from pathlib import Path
import dotenv
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
dotenv.load_dotenv()
@@ -24,12 +17,6 @@ for p in ps:
sources.append(p)
# with open('inputs/client.py', 'r') as f:
# tree = ast.parse(f.read())
# print(tree)
def get_functions_in_class(node):
functions = []
functions_code = []
@@ -66,16 +53,6 @@ for code in data:
with open('structure_dict.json', 'w') as f:
json.dump(structure_dict, f)
# llm = OpenAI(temperature=0)
# prompt = PromptTemplate(
# input_variables=["code"],
# template="Code: {code}, Documentation: ",
# )
#
# print(prompt.format(code="print('hello world')"))
# print(llm(prompt.format(code="print('hello world')")))
if not Path("outputs").exists():
Path("outputs").mkdir()

View File

@@ -1,19 +1,19 @@
import os
import sys
import nltk
import dotenv
import typer
from collections import defaultdict
from typing import List, Optional
import dotenv
import nltk
import typer
from parser.file.bulk import SimpleDirectoryReader
from parser.schema.base import Document
from parser.open_ai_func import call_openai_api, get_user_permission
from parser.py2doc import transform_to_docs
from parser.py2doc import extract_functions_and_classes as extract_py
from parser.js2doc import extract_functions_and_classes as extract_js
from parser.java2doc import extract_functions_and_classes as extract_java
from parser.js2doc import extract_functions_and_classes as extract_js
from parser.open_ai_func import call_openai_api, get_user_permission
from parser.py2doc import extract_functions_and_classes as extract_py
from parser.py2doc import transform_to_docs
from parser.schema.base import Document
from parser.token_func import group_split
dotenv.load_dotenv()
@@ -38,7 +38,8 @@ def ingest(yes: bool = typer.Option(False, "-y", "--yes", prompt=False,
limit: Optional[int] = typer.Option(None, help="Maximum number of files to read."),
formats: Optional[List[str]] = typer.Option([".rst", ".md"],
help="""List of required extensions (list with .)
Currently supported: .rst, .md, .pdf, .docx, .csv, .epub, .html, .mdx"""),
Currently supported:
.rst, .md, .pdf, .docx, .csv, .epub, .html, .mdx"""),
exclude: Optional[bool] = typer.Option(True, help="Whether to exclude hidden files (dotfiles)."),
sample: Optional[bool] = typer.Option(False,
help="Whether to output sample of the first 5 split documents."),
@@ -65,7 +66,7 @@ def ingest(yes: bool = typer.Option(False, "-y", "--yes", prompt=False,
# docs = text_splitter.split_documents(raw_docs)
# Sample feature
if sample == True:
if sample:
for i in range(min(5, len(raw_docs))):
print(raw_docs[i].text)
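
The `if sample == True:` fix above corresponds to Ruff rule E712: comparing against True with == is unidiomatic, and a plain truthiness test reads better. A small illustration:

sample = True

if sample == True:  # flagged by Ruff (E712)
    ...

if sample:  # idiomatic, equivalent for a boolean flag
    ...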

View File

@@ -1,38 +1,42 @@
from pathlib import Path
from langchain.text_splitter import CharacterTextSplitter
import faiss
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
import pickle
import dotenv
import tiktoken
import sys
from argparse import ArgumentParser
from pathlib import Path
import dotenv
import faiss
import tiktoken
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
def num_tokens_from_string(string: str, encoding_name: str) -> int:
# Function to convert string to tokens and estimate user cost.
# Function to convert string to tokens and estimate user cost.
encoding = tiktoken.get_encoding(encoding_name)
num_tokens = len(encoding.encode(string))
total_price = ((num_tokens/1000) * 0.0004)
total_price = ((num_tokens / 1000) * 0.0004)
return num_tokens, total_price
def call_openai_api():
# Function to create a vector store from the documents and save it to disk.
# Function to create a vector store from the documents and save it to disk.
store = FAISS.from_texts(docs, OpenAIEmbeddings(), metadatas=metadatas)
faiss.write_index(store.index, "docs.index")
store.index = None
with open("faiss_store.pkl", "wb") as f:
pickle.dump(store, f)
def get_user_permission():
# Function to ask user permission to call the OpenAI api and spend their OpenAI funds.
# Function to ask user permission to call the OpenAI api and spend their OpenAI funds.
# Here we convert the docs list to a string and calculate the number of OpenAI tokens the string represents.
docs_content = (" ".join(docs))
tokens, total_price = num_tokens_from_string(string=docs_content, encoding_name="cl100k_base")
# Here we print the number of tokens and the approx user cost with some visually appealing formatting.
print(f"Number of Tokens = {format(tokens, ',d')}")
print(f"Approx Cost = ${format(total_price, ',.2f')}")
#Here we check for user permission before calling the API.
# Here we check for user permission before calling the API.
user_input = input("Price Okay? (Y/N) \n").lower()
if user_input == "y":
call_openai_api()
@@ -41,7 +45,8 @@ def get_user_permission():
else:
print("The API was not called. No money was spent.")
#Load .env file
# Load .env file
dotenv.load_dotenv()
ap = ArgumentParser("Script for training DocsGPT on .rst documentation files.")
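
As a usage sketch of the token-costing helper above (note that the original annotates the return type as int but actually returns a (tokens, price) tuple, and the $0.0004 per 1,000 tokens figure is hard-coded, a snapshot of pricing rather than a constant):

import tiktoken

def num_tokens_from_string(string: str, encoding_name: str) -> tuple:
    # Convert the string to tokens and estimate the user's cost,
    # mirroring the function in the hunk above.
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    total_price = (num_tokens / 1000) * 0.0004
    return num_tokens, total_price

tokens, price = num_tokens_from_string("Hello DocsGPT", "cl100k_base")
print(f"{tokens} tokens, approx ${price:.6f}")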

View File

@@ -1,71 +1,75 @@
import os
import pickle
import dotenv
import tiktoken
import sys
import faiss
import shutil
import sys
from argparse import ArgumentParser
from pathlib import Path
from langchain.vectorstores import FAISS
import dotenv
import faiss
import tiktoken
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from sphinx.cmd.build import main as sphinx_main
from argparse import ArgumentParser
def convert_rst_to_txt(src_dir, dst_dir):
# Check if the source directory exists
if not os.path.exists(src_dir):
raise Exception("Source directory does not exist")
# Walk through the source directory
for root, dirs, files in os.walk(src_dir):
for file in files:
# Check if the file has .rst extension
if file.endswith(".rst"):
# Construct the full path of the file
src_file = os.path.join(root, file.replace(".rst", ""))
# Convert the .rst file to .txt file using sphinx-build
args = f". -b text -D extensions=sphinx.ext.autodoc " \
f"-D master_doc={src_file} " \
f"-D source_suffix=.rst " \
f"-C {dst_dir} "
sphinx_main(args.split())
elif file.endswith(".md"):
# Rename the .md file to .rst file
src_file = os.path.join(root, file)
dst_file = os.path.join(root, file.replace(".md", ".rst"))
os.rename(src_file, dst_file)
# Convert the .rst file to .txt file using sphinx-build
args = f". -b text -D extensions=sphinx.ext.autodoc " \
f"-D master_doc={dst_file} " \
f"-D source_suffix=.rst " \
f"-C {dst_dir} "
sphinx_main(args.split())
# Check if the source directory exists
if not os.path.exists(src_dir):
raise Exception("Source directory does not exist")
# Walk through the source directory
for root, dirs, files in os.walk(src_dir):
for file in files:
# Check if the file has .rst extension
if file.endswith(".rst"):
# Construct the full path of the file
src_file = os.path.join(root, file.replace(".rst", ""))
# Convert the .rst file to .txt file using sphinx-build
args = f". -b text -D extensions=sphinx.ext.autodoc " \
f"-D master_doc={src_file} " \
f"-D source_suffix=.rst " \
f"-C {dst_dir} "
sphinx_main(args.split())
elif file.endswith(".md"):
# Rename the .md file to .rst file
src_file = os.path.join(root, file)
dst_file = os.path.join(root, file.replace(".md", ".rst"))
os.rename(src_file, dst_file)
# Convert the .rst file to .txt file using sphinx-build
args = f". -b text -D extensions=sphinx.ext.autodoc " \
f"-D master_doc={dst_file} " \
f"-D source_suffix=.rst " \
f"-C {dst_dir} "
sphinx_main(args.split())
def num_tokens_from_string(string: str, encoding_name: str) -> int:
# Function to convert string to tokens and estimate user cost.
# Function to convert string to tokens and estimate user cost.
encoding = tiktoken.get_encoding(encoding_name)
num_tokens = len(encoding.encode(string))
total_price = ((num_tokens/1000) * 0.0004)
total_price = ((num_tokens / 1000) * 0.0004)
return num_tokens, total_price
def call_openai_api():
# Function to create a vector store from the documents and save it to disk.
# Function to create a vector store from the documents and save it to disk.
store = FAISS.from_texts(docs, OpenAIEmbeddings(), metadatas=metadatas)
faiss.write_index(store.index, "docs.index")
store.index = None
with open("faiss_store.pkl", "wb") as f:
pickle.dump(store, f)
def get_user_permission():
# Function to ask user permission to call the OpenAI api and spend their OpenAI funds.
# Function to ask user permission to call the OpenAI api and spend their OpenAI funds.
# Here we convert the docs list to a string and calculate the number of OpenAI tokens the string represents.
docs_content = (" ".join(docs))
tokens, total_price = num_tokens_from_string(string=docs_content, encoding_name="cl100k_base")
# Here we print the number of tokens and the approx user cost with some visually appealing formatting.
print(f"Number of Tokens = {format(tokens, ',d')}")
print(f"Approx Cost = ${format(total_price, ',.2f')}")
#Here we check for user permission before calling the API.
# Here we check for user permission before calling the API.
user_input = input("Price Okay? (Y/N) \n").lower()
if user_input == "y":
call_openai_api()
@@ -74,6 +78,7 @@ def get_user_permission():
else:
print("The API was not called. No money was spent.")
ap = ArgumentParser("Script for training DocsGPT on Sphinx documentation")
ap.add_argument("-i", "--inputs",
type=str,
@@ -81,17 +86,17 @@ ap.add_argument("-i", "--inputs",
help="Directory containing documentation files")
args = ap.parse_args()
#Load .env file
# Load .env file
dotenv.load_dotenv()
#Directory to vector
# Directory to vector
src_dir = args.inputs
dst_dir = "tmp"
convert_rst_to_txt(src_dir, dst_dir)
# Here we load in the data in the format that Notion exports it in.
ps = list(Path("tmp/"+ src_dir).glob("**/*.txt"))
ps = list(Path("tmp/" + src_dir).glob("**/*.txt"))
# parse all child directories
data = []
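
For orientation: sphinx_main from sphinx.cmd.build takes the same arguments as the sphinx-build CLI, already split into a list, which is why the script above assembles an option string and calls args.split(). A minimal sketch (directory names are illustrative):

from sphinx.cmd.build import main as sphinx_main

# Equivalent to running: sphinx-build -b text docs tmp/docs
exit_code = sphinx_main(["-b", "text", "docs", "tmp/docs"])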

View File

@@ -3,7 +3,6 @@ from abc import abstractmethod
from typing import Any, List
from langchain.docstore.document import Document as LCDocument
from parser.schema.base import Document

View File

@@ -24,12 +24,11 @@ class HTMLParser(BaseParser):
Union[str, List[str]]: a string or a List of strings.
"""
try:
import unstructured
from unstructured.partition.html import partition_html
from unstructured.staging.base import convert_to_isd
from unstructured.cleaners.core import clean
except ImportError:
raise ValueError("unstructured package is required to parse HTML files.")
from unstructured.partition.html import partition_html
from unstructured.staging.base import convert_to_isd
from unstructured.cleaners.core import clean
# Using the unstructured library to convert the html to isd format
# isd sample : isd = [
@@ -70,7 +69,8 @@ class HTMLParser(BaseParser):
Chunks.append([])
Chunks[-1].append(isd_el['text'])
# Removing all the chunks with sum of lenth of all the strings in the chunk < 25 #TODO: This value can be an user defined variable
# Removing all the chunks with sum of lenth of all the strings in the chunk < 25
# TODO: This value can be a user defined variable
for chunk in Chunks:
# sum of lenth of all the strings in the chunk
sum = 0
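
The import hunk above keeps the unstructured imports inside the try block, so a missing optional dependency fails with an actionable message, and drops the duplicate imports that previously sat after the except. The general pattern, as used here:

try:
    from unstructured.partition.html import partition_html
except ImportError:
    # Turn a bare ImportError into a message that names the fix.
    raise ValueError("unstructured package is required to parse HTML files.")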

View File

@@ -7,8 +7,8 @@ import re
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union, cast
from parser.file.base_parser import BaseParser
import tiktoken
from parser.file.base_parser import BaseParser
class MarkdownParser(BaseParser):

View File

@@ -5,10 +5,9 @@ Contains parser for md files.
"""
import re
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union, cast
from typing import Any, Dict, List, Optional, Tuple, Union
from parser.file.base_parser import BaseParser
import tiktoken
class RstParser(BaseParser):

View File

@@ -1,4 +1,5 @@
import os
import javalang

View File

@@ -1,6 +1,7 @@
import os
import esprima
import escodegen
import esprima
def find_files(directory):
@@ -27,7 +28,6 @@ def extract_functions(file_path):
func_name = declaration.id.name if declaration.id else '<anonymous>'
functions[func_name] = escodegen.generate(declaration.init)
elif node.type == 'ClassDeclaration':
class_name = node.id.name
for subnode in node.body.body:
if subnode.type == 'MethodDefinition':
func_name = subnode.key.name
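
A hedged sketch of the esprima/escodegen round-trip this parser relies on: esprima.parseScript builds an AST whose nodes carry a type string, and escodegen.generate turns a node back into JavaScript source (the sample function below is illustrative only):

import escodegen
import esprima

source = "function greet(name) { return 'hi ' + name; }"
tree = esprima.parseScript(source)
for node in tree.body:
    if node.type == 'FunctionDeclaration':
        # Regenerate the function's source text from its AST node.
        print(node.id.name, '->', escodegen.generate(node))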

View File

@@ -1,16 +1,15 @@
import os
import faiss
import pickle
import tiktoken
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from retry import retry
# from langchain.embeddings import HuggingFaceEmbeddings
# from langchain.embeddings import HuggingFaceInstructEmbeddings
# from langchain.embeddings import CohereEmbeddings
from retry import retry
def num_tokens_from_string(string: str, encoding_name: str) -> int:
# Function to convert string to tokens and estimate user cost.

View File

@@ -1,7 +1,8 @@
import os
import ast
import tiktoken
import os
from pathlib import Path
import tiktoken
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate

View File

@@ -2,7 +2,6 @@
from dataclasses import dataclass
from langchain.docstore.document import Document as LCDocument
from parser.schema.schema import BaseDocument

View File

@@ -1,9 +1,9 @@
import re
import tiktoken
from typing import List
from parser.schema.base import Document
from math import ceil
from typing import List
import tiktoken
from parser.schema.base import Document
def separate_header_and_body(text):
@@ -59,16 +59,16 @@ def split_documents(documents: List[Document], max_tokens: int) -> List[Document
def group_split(documents: List[Document], max_tokens: int = 2000, min_tokens: int = 150, token_check: bool = True):
if token_check == False:
if not token_check:
return documents
print("Grouping small documents")
try:
documents = group_documents(documents=documents, min_tokens=min_tokens, max_tokens=max_tokens)
except:
except Exception:
print("Grouping failed, try running without token_check")
print("Separating large documents")
try:
documents = split_documents(documents=documents, max_tokens=max_tokens)
except:
except Exception:
print("Grouping failed, try running without token_check")
return documents
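
The final hunk swaps bare except: clauses for except Exception. A bare except also traps SystemExit and KeyboardInterrupt, which are meant to propagate; except Exception catches ordinary errors only. A small illustration:

try:
    raise KeyboardInterrupt  # e.g. the user presses Ctrl-C
except Exception:
    # Never reached: KeyboardInterrupt is not a subclass of Exception,
    # so it propagates as it should. A bare `except:` would swallow it.
    print("handled")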