From 2c364d3c007a282b73199c314b54de8347d4ec7c Mon Sep 17 00:00:00 2001 From: Pavel Date: Sat, 25 Feb 2023 17:37:33 +0400 Subject: [PATCH] Code_to_dict 3 languages added, works well with python. Java and Js require additional reviewing --- scripts/ingest.py | 46 ++++----- scripts/parser/java2doc.py | 61 ++++++++++++ scripts/parser/js2doc.py | 67 +++++++++++++ scripts/parser/py2doc.py | 194 +++++++++++++++---------------------- 4 files changed, 228 insertions(+), 140 deletions(-) create mode 100644 scripts/parser/java2doc.py create mode 100644 scripts/parser/js2doc.py diff --git a/scripts/ingest.py b/scripts/ingest.py index 4fca0e79..d5477d13 100644 --- a/scripts/ingest.py +++ b/scripts/ingest.py @@ -14,7 +14,11 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter from parser.file.bulk import SimpleDirectoryReader from parser.schema.base import Document from parser.open_ai_func import call_openai_api, get_user_permission -from parser.py2doc import get_classes, get_functions, transform_to_docs +from parser.py2doc import transform_to_docs +from parser.py2doc import extract_functions_and_classes as extract_py +from parser.js2doc import extract_functions_and_classes as extract_js +from parser.java2doc import extract_functions_and_classes as extract_java + dotenv.load_dotenv() @@ -83,27 +87,25 @@ def ingest(yes: bool = typer.Option(False, "-y", "--yes", prompt=False, @app.command() -def convert(): - ps = list(Path("inputs").glob("**/*.py")) - data = [] - sources = [] - for p in ps: - with open(p) as f: - data.append(f.read()) - sources.append(p) - - functions_dict = {} - classes_dict = {} - c1 = 0 - for code in data: - functions = get_functions(ast.parse(code)) - source = str(sources[c1]) - functions_dict[source] = functions - classes = get_classes(code) - classes_dict[source] = classes - c1 += 1 - - transform_to_docs(functions_dict, classes_dict) +def convert(dir: Optional[str] = typer.Option("inputs", + help="""Path to directory to make documentation 
for. + E.g. --dir inputs """), + formats: Optional[str] = typer.Option("py", + help="""Required language. + py, js, java supported for now""")): + """ + Creates documentation linked to original functions from specified location. + By default /inputs folder is used, .py is parsed. + """ + if formats == 'py': + functions_dict, classes_dict = extract_py(dir) + elif formats == 'js': + functions_dict, classes_dict = extract_js(dir) + elif formats == 'java': + functions_dict, classes_dict = extract_java(dir) + else: + raise Exception("Sorry, language not supported yet") + transform_to_docs(functions_dict, classes_dict, formats, dir) if __name__ == "__main__": app() diff --git a/scripts/parser/java2doc.py b/scripts/parser/java2doc.py new file mode 100644 index 00000000..c1701c5d --- /dev/null +++ b/scripts/parser/java2doc.py @@ -0,0 +1,61 @@ +import os +import javalang + +def find_files(directory): + files_list = [] + for root, dirs, files in os.walk(directory): + for file in files: + if file.endswith('.java'): + files_list.append(os.path.join(root, file)) + return files_list + +def extract_functions(file_path): + with open(file_path, "r") as file: + java_code = file.read() + methods = {} + tree = javalang.parse.parse(java_code) + for _, node in tree.filter(javalang.tree.MethodDeclaration): + method_name = node.name + start_line = node.position.line - 1 + end_line = start_line + brace_count = 0 + for line in java_code.splitlines()[start_line:]: + end_line += 1 + brace_count += line.count("{") - line.count("}") + if brace_count == 0: + break + method_source_code = "\n".join(java_code.splitlines()[start_line:end_line]) + methods[method_name] = method_source_code + return methods + +def extract_classes(file_path): + with open(file_path, 'r') as file: + source_code = file.read() + classes = {} + tree = javalang.parse.parse(source_code) + for class_decl in tree.types: + class_name = class_decl.name + declarations = [] + methods = [] + for field_decl in class_decl.fields: + 
field_name = field_decl.declarators[0].name + field_type = field_decl.type.name + declarations.append(f"{field_type} {field_name}") + for method_decl in class_decl.methods: + methods.append(method_decl.name) + class_string = "Declarations: " + ", ".join(declarations) + "\n Method name: " + ", ".join(methods) + classes[class_name] = class_string + return classes + +def extract_functions_and_classes(directory): + files = find_files(directory) + functions_dict = {} + classes_dict = {} + for file in files: + functions = extract_functions(file) + if functions: + functions_dict[file] = functions + classes = extract_classes(file) + if classes: + classes_dict[file] = classes + return functions_dict, classes_dict \ No newline at end of file diff --git a/scripts/parser/js2doc.py b/scripts/parser/js2doc.py new file mode 100644 index 00000000..d434ab23 --- /dev/null +++ b/scripts/parser/js2doc.py @@ -0,0 +1,67 @@ +import os +import esprima +import escodegen + + +def find_files(directory): + files_list = [] + for root, dirs, files in os.walk(directory): + for file in files: + if file.endswith('.js'): + files_list.append(os.path.join(root, file)) + return files_list + +def extract_functions(file_path): + with open(file_path, 'r') as file: + source_code = file.read() + functions = {} + tree = esprima.parseScript(source_code) + for node in tree.body: + if node.type == 'FunctionDeclaration': + func_name = node.id.name if node.id else '' + functions[func_name] = escodegen.generate(node) + elif node.type == 'VariableDeclaration': + for declaration in node.declarations: + if declaration.init and declaration.init.type == 'FunctionExpression': + func_name = declaration.id.name if declaration.id else '' + functions[func_name] = escodegen.generate(declaration.init) + elif node.type == 'ClassDeclaration': + class_name = node.id.name + for subnode in node.body.body: + if subnode.type == 'MethodDefinition': + func_name = subnode.key.name + functions[func_name] = 
escodegen.generate(subnode.value) + elif subnode.type == 'VariableDeclaration': + for declaration in subnode.declarations: + if declaration.init and declaration.init.type == 'FunctionExpression': + func_name = declaration.id.name if declaration.id else '' + functions[func_name] = escodegen.generate(declaration.init) + return functions + +def extract_classes(file_path): + with open(file_path, 'r') as file: + source_code = file.read() + classes = {} + tree = esprima.parseScript(source_code) + for node in tree.body: + if node.type == 'ClassDeclaration': + class_name = node.id.name + function_names = [] + for subnode in node.body.body: + if subnode.type == 'MethodDefinition': + function_names.append(subnode.key.name) + classes[class_name] = ", ".join(function_names) + return classes + +def extract_functions_and_classes(directory): + files = find_files(directory) + functions_dict = {} + classes_dict = {} + for file in files: + functions = extract_functions(file) + if functions: + functions_dict[file] = functions + classes = extract_classes(file) + if classes: + classes_dict[file] = classes + return functions_dict, classes_dict diff --git a/scripts/parser/py2doc.py b/scripts/parser/py2doc.py index b5a37c24..4ac73cd9 100644 --- a/scripts/parser/py2doc.py +++ b/scripts/parser/py2doc.py @@ -1,108 +1,87 @@ +import os +import ast +import tiktoken from pathlib import Path from langchain.llms import OpenAI from langchain.prompts import PromptTemplate -import dotenv -import ast -import typer -import tiktoken -dotenv.load_dotenv() - -def get_functions(source_code): - tree = ast.parse(source_code) - functions = {} - for node in tree.body: - if isinstance(node, ast.FunctionDef): - functions[node.name] = ast.unparse(node) +def find_files(directory): + files_list = [] + for root, dirs, files in os.walk(directory): + for file in files: + if file.endswith('.py'): + files_list.append(os.path.join(root, file)) + return files_list +def extract_functions(file_path): + with open(file_path, 
'r') as file: + source_code = file.read() + functions = {} + tree = ast.parse(source_code) + for node in ast.walk(tree): + if isinstance(node, ast.FunctionDef): + func_name = node.name + func_def = ast.get_source_segment(source_code, node) + functions[func_name] = func_def return functions -def get_functions_names(node): - functions = [] - for child in node.body: - if isinstance(child, ast.FunctionDef): - functions.append(child.name) - return functions - - - -def get_classes(source_code): - tree = ast.parse(source_code) - classes = {} - for node in tree.body: - if isinstance(node, ast.ClassDef): - classes[node.name] = get_functions_names(node) +def extract_classes(file_path): + with open(file_path, 'r') as file: + source_code = file.read() + classes = {} + tree = ast.parse(source_code) + for node in ast.walk(tree): + if isinstance(node, ast.ClassDef): + class_name = node.name + function_names = [] + for subnode in ast.walk(node): + if isinstance(subnode, ast.FunctionDef): + function_names.append(subnode.name) + classes[class_name] = ", ".join(function_names) return classes -def get_functions_in_class(source_code, class_name): - tree = ast.parse(source_code) - functions = [] - for node in tree.body: - if isinstance(node, ast.ClassDef): - if node.name == class_name: - for function in node.body: - if isinstance(function, ast.FunctionDef): - functions.append(function.name) - return functions +def extract_functions_and_classes(directory): + files = find_files(directory) + functions_dict = {} + classes_dict = {} + for file in files: + functions = extract_functions(file) + if functions: + functions_dict[file] = functions + classes = extract_classes(file) + if classes: + classes_dict[file] = classes + return functions_dict, classes_dict - -def parse_functions(functions_dict): +def parse_functions(functions_dict, formats, dir): c1 = len(functions_dict) - c2 = 0 - for source, functions in functions_dict.items(): - c2 += 1 - print(f"Processing file {c2}/{c1}") - f1 = 
len(functions) - f2 = 0 - source_w = source.replace("inputs/", "") - source_w = source_w.replace(".py", ".md") - # this is how we check subfolders - if "/" in source_w: - subfolders = source_w.split("/") - subfolders = subfolders[:-1] - subfolders = "/".join(subfolders) - if not Path(f"outputs/{subfolders}").exists(): - Path(f"outputs/{subfolders}").mkdir(parents=True) - - for name, function in functions.items(): - f2 += 1 - print(f"Processing function {f2}/{f1}") + for i, (source, functions) in enumerate(functions_dict.items(), start=1): + print(f"Processing file {i}/{c1}") + source_w = source.replace(dir+"/", "").replace("."+formats, ".md") + subfolders = "/".join(source_w.split("/")[:-1]) + Path(f"outputs/{subfolders}").mkdir(parents=True, exist_ok=True) + for j, (name, function) in enumerate(functions.items(), start=1): + print(f"Processing function {j}/{len(functions)}") prompt = PromptTemplate( input_variables=["code"], template="Code: \n{code}, \nDocumentation: ", ) llm = OpenAI(temperature=0) response = llm(prompt.format(code=function)) - - if not Path(f"outputs/{source_w}").exists(): - with open(f"outputs/{source_w}", "w") as f: - f.write(f"# Function name: {name} \n\nFunction: \n```\n{function}\n```, \nDocumentation: \n{response}") - else: - with open(f"outputs/{source_w}", "a") as f: - f.write(f"\n\n# Function name: {name} \n\nFunction: \n```\n{function}\n```, \nDocumentation: \n{response}") + mode = "a" if Path(f"outputs/{source_w}").exists() else "w" + with open(f"outputs/{source_w}", mode) as f: + f.write(f"\n\n# Function name: {name} \n\nFunction: \n```\n{function}\n```, \nDocumentation: \n{response}") -def parse_classes(classes_dict): +def parse_classes(classes_dict, formats, dir): c1 = len(classes_dict) - c2 = 0 - for source, classes in classes_dict.items(): - c2 += 1 - print(f"Processing file {c2}/{c1}") - f1 = len(classes) - f2 = 0 - source_w = source.replace("inputs/", "") - source_w = source_w.replace(".py", ".md") - - if "/" in source_w: - 
subfolders = source_w.split("/") - subfolders = subfolders[:-1] - subfolders = "/".join(subfolders) - if not Path(f"outputs/{subfolders}").exists(): - Path(f"outputs/{subfolders}").mkdir(parents=True) - + for i, (source, classes) in enumerate(classes_dict.items()): + print(f"Processing file {i+1}/{c1}") + source_w = source.replace(dir+"/", "").replace("."+formats, ".md") + subfolders = "/".join(source_w.split("/")[:-1]) + Path(f"outputs/{subfolders}").mkdir(parents=True, exist_ok=True) for name, function_names in classes.items(): - print(f"Processing Class {f2}/{f1}") - f2 += 1 + print(f"Processing Class {i+1}/{c1}") prompt = PromptTemplate( input_variables=["class_name", "functions_names"], template="Class name: {class_name} \nFunctions: {functions_names}, \nDocumentation: ", @@ -110,46 +89,25 @@ def parse_classes(classes_dict): llm = OpenAI(temperature=0) response = llm(prompt.format(class_name=name, functions_names=function_names)) - if not Path(f"outputs/{source_w}").exists(): - with open(f"outputs/{source_w}", "w") as f: - f.write(f"# Class name: {name} \n\nFunctions: \n{function_names}, \nDocumentation: \n{response}") - else: - with open(f"outputs/{source_w}", "a") as f: - f.write(f"\n\n# Class name: {name} \n\nFunctions: \n{function_names}, \nDocumentation: \n{response}") + with open(f"outputs/{source_w}", "a" if Path(f"outputs/{source_w}").exists() else "w") as f: + f.write(f"\n\n# Class name: {name} \n\nFunctions: \n{function_names}, \nDocumentation: \n{response}") +def transform_to_docs(functions_dict, classes_dict, formats, dir): + docs_content = ''.join([str(key) + str(value) for key, value in functions_dict.items()]) + docs_content += ''.join([str(key) + str(value) for key, value in classes_dict.items()]) -#User permission -def transform_to_docs(functions_dict, classes_dict): -# Function to ask user permission to call the OpenAI api and spend their OpenAI funds. 
- # Here we convert dicts to a string and calculate the number of OpenAI tokens the string represents. - docs_content = "" - for key, value in functions_dict.items(): - docs_content += str(key) + str(value) - for key, value in classes_dict.items(): - docs_content += str(key) + str(value) - - encoding = tiktoken.get_encoding("cl100k_base") - num_tokens = len(encoding.encode(docs_content)) + num_tokens = len(tiktoken.get_encoding("cl100k_base").encode(docs_content)) total_price = ((num_tokens / 1000) * 0.02) - # Here we print the number of tokens and the approx user cost with some visually appealing formatting. - print(f"Number of Tokens = {format(num_tokens, ',d')}") - print(f"Approx Cost = ${format(total_price, ',.2f')}") - #Here we check for user permission before calling the API. - user_input = input("Price Okay? (Y/N) \n").lower() - if user_input == "y": + print(f"Number of Tokens = {num_tokens:,d}") + print(f"Approx Cost = ${total_price:,.2f}") + + user_input = input("Price Okay? (Y/N)\n").lower() + if user_input == "y" or user_input == "": if not Path("outputs").exists(): Path("outputs").mkdir() - parse_functions(functions_dict) - print("Functions done!") - parse_classes(classes_dict) - print("All done!") - elif user_input == "": - if not Path("outputs").exists(): - Path("outputs").mkdir() - parse_functions(functions_dict) - print("Functions done!") - parse_classes(classes_dict) + parse_functions(functions_dict, formats, dir) + parse_classes(classes_dict, formats, dir) print("All done!") else: print("The API was not called. No money was spent.") \ No newline at end of file