mirror of
https://github.com/arc53/DocsGPT.git
synced 2025-11-30 17:13:15 +00:00
Merge branch 'main' into pr/115
This commit is contained in:
2
.gitignore
vendored
2
.gitignore
vendored
@@ -108,7 +108,7 @@ venv/
|
||||
ENV/
|
||||
env.bak/
|
||||
venv.bak/
|
||||
|
||||
.flaskenv
|
||||
# Spyder project settings
|
||||
.spyderproject
|
||||
.spyproject
|
||||
|
||||
@@ -1,10 +1,19 @@
|
||||
FROM python:3.9
|
||||
FROM python:3.11-slim-bullseye as builder
|
||||
|
||||
# Tiktoken requires Rust toolchain, so build it in a separate stage
|
||||
RUN apt-get update && apt-get install -y gcc curl
|
||||
RUN curl https://sh.rustup.rs -sSf | sh -s -- -y && apt-get install --reinstall libc6-dev -y
|
||||
ENV PATH="/root/.cargo/bin:${PATH}"
|
||||
RUN pip install --upgrade pip && pip install tiktoken==0.1.2
|
||||
|
||||
FROM python:3.11-slim-bullseye
|
||||
# Copy pre-built packages from builder stage
|
||||
COPY --from=builder /usr/local/lib/python3.11/site-packages/ /usr/local/lib/python3.11/site-packages/
|
||||
WORKDIR /app
|
||||
COPY . /app
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
ENV FLASK_APP=app.py
|
||||
ENV FLASK_ENV=development
|
||||
RUN pip install -r requirements.txt
|
||||
|
||||
EXPOSE 5000
|
||||
|
||||
|
||||
@@ -9,7 +9,7 @@ from langchain import OpenAI, VectorDBQA, HuggingFaceHub, Cohere
|
||||
from langchain.chains.question_answering import load_qa_chain
|
||||
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceHubEmbeddings, CohereEmbeddings, HuggingFaceInstructEmbeddings
|
||||
from langchain.prompts import PromptTemplate
|
||||
|
||||
from error import bad_request
|
||||
# os.environ["LANGCHAIN_HANDLER"] = "langchain"
|
||||
|
||||
if os.getenv("LLM_NAME") is not None:
|
||||
@@ -74,6 +74,7 @@ def api_answer():
|
||||
data = request.get_json()
|
||||
question = data["question"]
|
||||
history = data["history"]
|
||||
print('-'*5)
|
||||
if not api_key_set:
|
||||
api_key = data["api_key"]
|
||||
else:
|
||||
@@ -83,62 +84,68 @@ def api_answer():
|
||||
else:
|
||||
embeddings_key = os.getenv("EMBEDDINGS_KEY")
|
||||
|
||||
# use try and except to check for exception
|
||||
try:
|
||||
|
||||
# check if the vectorstore is set
|
||||
if "active_docs" in data:
|
||||
vectorstore = "vectors/" + data["active_docs"]
|
||||
if data['active_docs'] == "default":
|
||||
# check if the vectorstore is set
|
||||
if "active_docs" in data:
|
||||
vectorstore = "vectors/" + data["active_docs"]
|
||||
if data['active_docs'] == "default":
|
||||
vectorstore = ""
|
||||
else:
|
||||
vectorstore = ""
|
||||
else:
|
||||
vectorstore = ""
|
||||
|
||||
# loading the index and the store and the prompt template
|
||||
# Note if you have used other embeddings than OpenAI, you need to change the embeddings
|
||||
if embeddings_choice == "openai_text-embedding-ada-002":
|
||||
docsearch = FAISS.load_local(vectorstore, OpenAIEmbeddings(openai_api_key=embeddings_key))
|
||||
elif embeddings_choice == "huggingface_sentence-transformers/all-mpnet-base-v2":
|
||||
docsearch = FAISS.load_local(vectorstore, HuggingFaceHubEmbeddings())
|
||||
elif embeddings_choice == "huggingface_hkunlp/instructor-large":
|
||||
docsearch = FAISS.load_local(vectorstore, HuggingFaceInstructEmbeddings())
|
||||
elif embeddings_choice == "cohere_medium":
|
||||
docsearch = FAISS.load_local(vectorstore, CohereEmbeddings(cohere_api_key=embeddings_key))
|
||||
# loading the index and the store and the prompt template
|
||||
# Note if you have used other embeddings than OpenAI, you need to change the embeddings
|
||||
if embeddings_choice == "openai_text-embedding-ada-002":
|
||||
docsearch = FAISS.load_local(vectorstore, OpenAIEmbeddings(openai_api_key=embeddings_key))
|
||||
elif embeddings_choice == "huggingface_sentence-transformers/all-mpnet-base-v2":
|
||||
docsearch = FAISS.load_local(vectorstore, HuggingFaceHubEmbeddings())
|
||||
elif embeddings_choice == "huggingface_hkunlp/instructor-large":
|
||||
docsearch = FAISS.load_local(vectorstore, HuggingFaceInstructEmbeddings())
|
||||
elif embeddings_choice == "cohere_medium":
|
||||
docsearch = FAISS.load_local(vectorstore, CohereEmbeddings(cohere_api_key=embeddings_key))
|
||||
|
||||
# create a prompt template
|
||||
if history:
|
||||
history = json.loads(history)
|
||||
template_temp = template_hist.replace("{historyquestion}", history[0]).replace("{historyanswer}", history[1])
|
||||
c_prompt = PromptTemplate(input_variables=["summaries", "question"], template=template_temp, template_format="jinja2")
|
||||
else:
|
||||
c_prompt = PromptTemplate(input_variables=["summaries", "question"], template=template, template_format="jinja2")
|
||||
# create a prompt template
|
||||
if history:
|
||||
history = json.loads(history)
|
||||
template_temp = template_hist.replace("{historyquestion}", history[0]).replace("{historyanswer}", history[1])
|
||||
c_prompt = PromptTemplate(input_variables=["summaries", "question"], template=template_temp, template_format="jinja2")
|
||||
else:
|
||||
c_prompt = PromptTemplate(input_variables=["summaries", "question"], template=template, template_format="jinja2")
|
||||
|
||||
if llm_choice == "openai":
|
||||
llm = OpenAI(openai_api_key=api_key, temperature=0)
|
||||
elif llm_choice == "manifest":
|
||||
llm = ManifestWrapper(client=manifest, llm_kwargs={"temperature": 0.001, "max_tokens": 2048})
|
||||
elif llm_choice == "huggingface":
|
||||
llm = HuggingFaceHub(repo_id="bigscience/bloom", huggingfacehub_api_token=api_key)
|
||||
elif llm_choice == "cohere":
|
||||
llm = Cohere(model="command-xlarge-nightly", cohere_api_key=api_key)
|
||||
if llm_choice == "openai":
|
||||
llm = OpenAI(openai_api_key=api_key, temperature=0)
|
||||
elif llm_choice == "manifest":
|
||||
llm = ManifestWrapper(client=manifest, llm_kwargs={"temperature": 0.001, "max_tokens": 2048})
|
||||
elif llm_choice == "huggingface":
|
||||
llm = HuggingFaceHub(repo_id="bigscience/bloom", huggingfacehub_api_token=api_key)
|
||||
elif llm_choice == "cohere":
|
||||
llm = Cohere(model="command-xlarge-nightly", cohere_api_key=api_key)
|
||||
|
||||
qa_chain = load_qa_chain(llm=llm, chain_type="map_reduce",
|
||||
combine_prompt=c_prompt)
|
||||
qa_chain = load_qa_chain(llm=llm, chain_type="map_reduce",
|
||||
combine_prompt=c_prompt)
|
||||
|
||||
chain = VectorDBQA(combine_documents_chain=qa_chain, vectorstore=docsearch, k=4)
|
||||
chain = VectorDBQA(combine_documents_chain=qa_chain, vectorstore=docsearch, k=4)
|
||||
|
||||
# fetch the answer
|
||||
result = chain({"query": question})
|
||||
print(result)
|
||||
|
||||
# fetch the answer
|
||||
result = chain({"query": question})
|
||||
print(result)
|
||||
|
||||
# some formatting for the frontend
|
||||
result['answer'] = result['result']
|
||||
result['answer'] = result['answer'].replace("\\n", "<br>")
|
||||
result['answer'] = result['answer'].replace("SOURCES:", "")
|
||||
# mock result
|
||||
# result = {
|
||||
# "answer": "The answer is 42",
|
||||
# "sources": ["https://en.wikipedia.org/wiki/42_(number)", "https://en.wikipedia.org/wiki/42_(number)"]
|
||||
# }
|
||||
return result
|
||||
# some formatting for the frontend
|
||||
result['answer'] = result['result']
|
||||
result['answer'] = result['answer'].replace("\\n", "<br>")
|
||||
result['answer'] = result['answer'].replace("SOURCES:", "")
|
||||
# mock result
|
||||
# result = {
|
||||
# "answer": "The answer is 42",
|
||||
# "sources": ["https://en.wikipedia.org/wiki/42_(number)", "https://en.wikipedia.org/wiki/42_(number)"]
|
||||
# }
|
||||
return result
|
||||
except Exception as e:
|
||||
print(str(e))
|
||||
return bad_request(500,str(e))
|
||||
|
||||
|
||||
@app.route("/api/docs_check", methods=["POST"])
|
||||
|
||||
13
application/error.py
Normal file
13
application/error.py
Normal file
@@ -0,0 +1,13 @@
|
||||
from flask import jsonify
|
||||
from werkzeug.http import HTTP_STATUS_CODES
|
||||
|
||||
def response_error(code_status,message=None):
|
||||
payload = {'error':HTTP_STATUS_CODES.get(code_status,"something went wrong")}
|
||||
if message:
|
||||
payload['message'] = message
|
||||
response = jsonify(payload)
|
||||
response.status_code = code_status
|
||||
return response
|
||||
|
||||
def bad_request(status_code=400,message=''):
|
||||
return response_error(code_status=status_code,message=message)
|
||||
@@ -64,6 +64,7 @@ transformers==4.26.0
|
||||
typer==0.7.0
|
||||
typing-inspect==0.8.0
|
||||
typing_extensions==4.4.0
|
||||
unstructured==0.4.8
|
||||
urllib3==1.26.14
|
||||
Werkzeug==2.2.3
|
||||
XlsxWriter==3.0.8
|
||||
|
||||
@@ -1,55 +1,73 @@
|
||||
var el = document.getElementById('message-form');
|
||||
if (el) {
|
||||
el.addEventListener("submit", function (event) {
|
||||
console.log("submitting")
|
||||
event.preventDefault()
|
||||
var message = document.getElementById("message-input").value;
|
||||
msg_html = '<div class="bg-blue-500 text-white p-2 rounded-lg mb-2 self-end"><p class="text-sm">'
|
||||
msg_html += message
|
||||
msg_html += '</p></div>'
|
||||
document.getElementById("messages").innerHTML += msg_html;
|
||||
let chatWindow = document.getElementById("messages-container");
|
||||
chatWindow.scrollTop = chatWindow.scrollHeight;
|
||||
document.getElementById("message-input").value = "";
|
||||
document.getElementById("button-submit").innerHTML = '<i class="fa fa-circle-o-notch fa-spin"></i> Thinking...';
|
||||
document.getElementById("button-submit").disabled = true;
|
||||
if (localStorage.getItem('activeDocs') == null) {
|
||||
localStorage.setItem('activeDocs', 'default')
|
||||
}
|
||||
var form = document.getElementById('message-form');
|
||||
var errorModal = document.getElementById('error-alert')
|
||||
document.getElementById('close').addEventListener('click',()=>{
|
||||
errorModal.classList.toggle('hidden')
|
||||
})
|
||||
|
||||
|
||||
function submitForm(event){
|
||||
event.preventDefault()
|
||||
var message = document.getElementById("message-input").value;
|
||||
console.log(message.length)
|
||||
if(message.length === 0){
|
||||
return
|
||||
}
|
||||
msg_html = '<div class="bg-blue-500 text-white p-2 rounded-lg mb-2 self-end"><p class="text-sm">'
|
||||
msg_html += message
|
||||
msg_html += '</p></div>'
|
||||
document.getElementById("messages").innerHTML += msg_html;
|
||||
let chatWindow = document.getElementById("messages-container");
|
||||
chatWindow.scrollTop = chatWindow.scrollHeight;
|
||||
document.getElementById("message-input").value = "";
|
||||
document.getElementById("button-submit").innerHTML = '<i class="fa fa-circle-o-notch fa-spin"></i> Thinking...';
|
||||
document.getElementById("button-submit").disabled = true;
|
||||
if (localStorage.getItem('activeDocs') == null) {
|
||||
localStorage.setItem('activeDocs', 'default')
|
||||
}
|
||||
|
||||
|
||||
fetch('/api/answer', {
|
||||
method: 'POST',
|
||||
headers: {
|
||||
'Content-Type': 'application/json',
|
||||
},
|
||||
|
||||
body: JSON.stringify({question: message,
|
||||
api_key: localStorage.getItem('apiKey'),
|
||||
embeddings_key: localStorage.getItem('apiKey'),
|
||||
history: localStorage.getItem('chatHistory'),
|
||||
active_docs: localStorage.getItem('activeDocs')}),
|
||||
}).then((response)=> response.json())
|
||||
.then(data => {
|
||||
console.log('Success:', data);
|
||||
if(data.error){
|
||||
document.getElementById('text-error').textContent = `Error : ${JSON.stringify(data.message)}`
|
||||
errorModal.classList.toggle('hidden')
|
||||
}
|
||||
if(data.answer){
|
||||
msg_html = '<div class="bg-indigo-500 text-white p-2 rounded-lg mb-2 self-start"><code class="text-sm">'
|
||||
msg_html += data.answer
|
||||
msg_html += '</code></div>'
|
||||
document.getElementById("messages").innerHTML += msg_html;
|
||||
let chatWindow = document.getElementById("messages-container");
|
||||
chatWindow.scrollTop = chatWindow.scrollHeight;
|
||||
}
|
||||
document.getElementById("button-submit").innerHTML = 'Send';
|
||||
document.getElementById("button-submit").disabled = false;
|
||||
let chatHistory = [message, data.answer || ''];
|
||||
localStorage.setItem('chatHistory', JSON.stringify(chatHistory));
|
||||
|
||||
|
||||
|
||||
fetch('/api/answer', {
|
||||
method: 'POST',
|
||||
headers: {
|
||||
'Content-Type': 'application/json',
|
||||
},
|
||||
|
||||
body: JSON.stringify({question: message,
|
||||
api_key: localStorage.getItem('apiKey'),
|
||||
embeddings_key: localStorage.getItem('apiKey'),
|
||||
history: localStorage.getItem('chatHistory'),
|
||||
active_docs: localStorage.getItem('activeDocs')}),
|
||||
})
|
||||
.then(response => response.json())
|
||||
.then(data => {
|
||||
console.log('Success:', data);
|
||||
msg_html = '<div class="bg-indigo-500 text-white p-2 rounded-lg mb-2 self-start"><code class="text-sm">'
|
||||
msg_html += data.answer
|
||||
msg_html += '</code></div>'
|
||||
document.getElementById("messages").innerHTML += msg_html;
|
||||
let chatWindow = document.getElementById("messages-container");
|
||||
chatWindow.scrollTop = chatWindow.scrollHeight;
|
||||
document.getElementById("button-submit").innerHTML = 'Send';
|
||||
document.getElementById("button-submit").disabled = false;
|
||||
let chatHistory = [message, data.answer];
|
||||
localStorage.setItem('chatHistory', JSON.stringify(chatHistory));
|
||||
})
|
||||
.catch((error) => {
|
||||
console.error('Error:', error);
|
||||
console.log(error);
|
||||
document.getElementById("button-submit").innerHTML = 'Send';
|
||||
document.getElementById("button-submit").disabled = false;
|
||||
});
|
||||
.catch((error) => {
|
||||
console.error('Error:', error);
|
||||
// console.log(error);
|
||||
// document.getElementById("button-submit").innerHTML = 'Send';
|
||||
// document.getElementById("button-submit").disabled = false;
|
||||
|
||||
});
|
||||
}
|
||||
|
||||
});
|
||||
}
|
||||
window.addEventListener('submit',submitForm)
|
||||
|
||||
@@ -16,7 +16,7 @@
|
||||
|
||||
|
||||
<body>
|
||||
|
||||
|
||||
|
||||
|
||||
<header class="bg-white p-2 flex justify-between items-center">
|
||||
@@ -28,6 +28,17 @@
|
||||
{% endif %}
|
||||
</div>
|
||||
</header>
|
||||
|
||||
|
||||
<!-- Alert Info -->
|
||||
<div class="border flex justify-between
|
||||
w-auto px-4 py-3 rounded relative
|
||||
hidden" style="background-color: rgb(197, 51, 51);color: white;" id="error-alert" role="alert">
|
||||
<span class="block sm:inline" id="text-error"></span>
|
||||
<strong class="text-xl align-center alert-del" style="cursor: pointer;" id="close">×</strong>
|
||||
</div>
|
||||
|
||||
|
||||
<div class="lg:flex ml-2 mr-2">
|
||||
<div class="lg:w-3/4 min-h-screen max-h-screen">
|
||||
<div class="w-full flex flex-col h-5/6">
|
||||
@@ -59,6 +70,8 @@ This will return a new DataFrame with all the columns from both tables, and only
|
||||
</form>
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
|
||||
</div>
|
||||
</div>
|
||||
@@ -77,11 +90,16 @@ This will return a new DataFrame with all the columns from both tables, and only
|
||||
</div>
|
||||
|
||||
<div class="flex items-center justify-center h-full">
|
||||
|
||||
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
|
||||
{% if not api_key_set %}
|
||||
<div class="fixed z-10 overflow-y-auto top-0 w-full left-0 hidden" id="modal">
|
||||
|
||||
<div class="fixed z-10 overflow-y-auto top-0 w-full left-0 show" id="modal">
|
||||
<div class="flex items-center justify-center min-height-100vh pt-4 px-4 pb-20 text-center sm:block sm:p-0">
|
||||
<div class="fixed inset-0 transition-opacity">
|
||||
<div class="absolute inset-0 bg-gray-900 opacity-75" />
|
||||
@@ -105,6 +123,9 @@ This will return a new DataFrame with all the columns from both tables, and only
|
||||
</div>
|
||||
</div>
|
||||
{% endif %}
|
||||
|
||||
|
||||
|
||||
<script>
|
||||
function docsIndex() {
|
||||
// loads latest index from https://raw.githubusercontent.com/arc53/DocsHUB/main/combined.json
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
import { useState } from 'react';
|
||||
import { NavLink } from 'react-router-dom';
|
||||
import Arrow1 from './assets/arrow.svg';
|
||||
import Message from './assets/message.svg';
|
||||
import Hamburger from './assets/hamburger.svg';
|
||||
import Key from './assets/key.svg';
|
||||
import Info from './assets/info.svg';
|
||||
@@ -63,6 +64,19 @@ export default function Navigation({
|
||||
/>
|
||||
</button>
|
||||
</div>
|
||||
<NavLink
|
||||
to={'/'}
|
||||
className={({ isActive }) =>
|
||||
`${
|
||||
isActive ? 'bg-gray-3000' : ''
|
||||
} my-auto mx-4 mt-4 flex h-12 cursor-pointer gap-4 rounded-md hover:bg-gray-100`
|
||||
}
|
||||
>
|
||||
<img src={Message} className="ml-2 w-5"></img>
|
||||
<p className="my-auto text-eerie-black">Chat</p>
|
||||
</NavLink>
|
||||
|
||||
<div className="flex-grow border-b-2 border-gray-100"></div>
|
||||
<div className="flex flex-grow flex-col-reverse border-b-2">
|
||||
<div className="relative my-4 px-6 ">
|
||||
<div
|
||||
@@ -107,7 +121,6 @@ export default function Navigation({
|
||||
</div>
|
||||
<p className="ml-6 font-bold text-jet">Source Docs</p>
|
||||
</div>
|
||||
|
||||
<div className="flex flex-col gap-2 border-b-2 py-2">
|
||||
<div
|
||||
className="my-auto mx-4 flex h-12 cursor-pointer gap-4 rounded-md hover:bg-gray-100"
|
||||
@@ -123,7 +136,11 @@ export default function Navigation({
|
||||
<div className="flex flex-col gap-2 border-b-2 py-2">
|
||||
<NavLink
|
||||
to="/about"
|
||||
className="my-auto mx-4 flex h-12 cursor-pointer gap-4 rounded-md hover:bg-gray-100"
|
||||
className={({ isActive }) =>
|
||||
`my-auto mx-4 flex h-12 cursor-pointer gap-4 rounded-md hover:bg-gray-100 ${
|
||||
isActive ? 'bg-gray-3000' : ''
|
||||
}`
|
||||
}
|
||||
>
|
||||
<img src={Info} alt="info" className="ml-2 w-5" />
|
||||
<p className="my-auto text-eerie-black">About</p>
|
||||
|
||||
3
frontend/src/assets/message.svg
Normal file
3
frontend/src/assets/message.svg
Normal file
@@ -0,0 +1,3 @@
|
||||
<svg width="20" height="20" viewBox="0 0 20 20" fill="none" xmlns="http://www.w3.org/2000/svg">
|
||||
<path d="M18 0H2C0.9 0 0 0.9 0 2V20L4 16H18C19.1 16 20 15.1 20 14V2C20 0.9 19.1 0 18 0ZM18 14H4L2 16V2H18V14Z" fill="black" fill-opacity="0.54"/>
|
||||
</svg>
|
||||
|
After Width: | Height: | Size: 249 B |
@@ -35,20 +35,20 @@ export default function Conversation() {
|
||||
return (
|
||||
<ConversationBubble
|
||||
ref={index === messages.length - 1 ? endMessageRef : null}
|
||||
className="mb-7"
|
||||
className={`${index === messages.length - 1 ? 'mb-20' : 'mb-7'}`}
|
||||
key={index}
|
||||
message={message.text}
|
||||
type={message.type}
|
||||
></ConversationBubble>
|
||||
);
|
||||
})}
|
||||
{messages.length === 0 && <Hero className="mt-24"></Hero>}
|
||||
{messages.length === 0 && <Hero className="mt-24 md:mt-52"></Hero>}
|
||||
</div>
|
||||
<div className="fixed bottom-2 flex w-10/12 md:w-[50%]">
|
||||
<div className="fixed bottom-14 flex w-10/12 md:bottom-12 md:w-[50%]">
|
||||
<div
|
||||
ref={inputRef}
|
||||
contentEditable
|
||||
className={`min-h-5 border-000000 overflow-x-hidden; max-h-24 w-full overflow-y-auto rounded-xl border bg-white p-2 pr-9 opacity-100 focus:border-2 focus:outline-none`}
|
||||
className={`border-000000 overflow-x-hidden; max-h-24 min-h-[2.6rem] w-full overflow-y-auto rounded-xl border bg-white p-2 pr-9 opacity-100 focus:border-2 focus:outline-none`}
|
||||
></div>
|
||||
{status === 'loading' ? (
|
||||
<img
|
||||
@@ -68,6 +68,10 @@ export default function Conversation() {
|
||||
></img>
|
||||
)}
|
||||
</div>
|
||||
<p className="fixed bottom-6 w-10/12 text-center text-xs text-gray-2000">
|
||||
This is a chatbot that uses the GPT-3, Faiss and LangChain to answer
|
||||
questions.
|
||||
</p>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
@@ -62,7 +62,7 @@ export default function APIKeyModal({
|
||||
>
|
||||
<article className="mx-auto mt-24 flex w-[90vw] max-w-lg flex-col gap-4 rounded-lg bg-white p-6 shadow-lg">
|
||||
<p className="text-xl text-jet">OpenAI API Key</p>
|
||||
<p className="text-lg leading-5 text-gray-500">
|
||||
<p className="text-md leading-6 text-gray-500">
|
||||
Before you can start using DocsGPT we need you to provide an API key
|
||||
for llm. Currently, we support only OpenAI but soon many more. You can
|
||||
find it here.
|
||||
|
||||
@@ -1,158 +1,132 @@
|
||||
import { useEffect, useState } from 'react';
|
||||
import { useDispatch, useSelector } from 'react-redux';
|
||||
import { ActiveState } from '../models/misc';
|
||||
import {
|
||||
setSelectedDocs,
|
||||
setSourceDocs,
|
||||
selectSourceDocs,
|
||||
} from './preferenceSlice';
|
||||
import {
|
||||
getDocs,
|
||||
Doc,
|
||||
getLocalRecentDocs,
|
||||
setLocalRecentDocs,
|
||||
} from './preferenceApi';
|
||||
|
||||
export default function APIKeyModal({
|
||||
modalState,
|
||||
setModalState,
|
||||
isCancellable = true,
|
||||
}: {
|
||||
modalState: ActiveState;
|
||||
setModalState: (val: ActiveState) => void;
|
||||
isCancellable?: boolean;
|
||||
}) {
|
||||
const dispatch = useDispatch();
|
||||
const docs = useSelector(selectSourceDocs);
|
||||
const [localSelectedDocs, setLocalSelectedDocs] = useState<Doc | null>(null);
|
||||
const [isDocsListOpen, setIsDocsListOpen] = useState(false);
|
||||
const [isError, setIsError] = useState(false);
|
||||
|
||||
function handleSubmit() {
|
||||
if (!localSelectedDocs) {
|
||||
setIsError(true);
|
||||
} else {
|
||||
setLocalRecentDocs(localSelectedDocs);
|
||||
dispatch(setSelectedDocs(localSelectedDocs));
|
||||
setModalState('INACTIVE');
|
||||
setIsError(false);
|
||||
}
|
||||
}
|
||||
|
||||
function handleCancel() {
|
||||
async function getRecentDocs() {
|
||||
const response = await getLocalRecentDocs();
|
||||
|
||||
if (response) {
|
||||
const parsedResponse = JSON.parse(response) as Doc;
|
||||
setLocalSelectedDocs(parsedResponse);
|
||||
}
|
||||
}
|
||||
|
||||
getRecentDocs();
|
||||
setIsError(false);
|
||||
setModalState('INACTIVE');
|
||||
}
|
||||
|
||||
useEffect(() => {
|
||||
function getRecentDocs() {
|
||||
const response = getLocalRecentDocs();
|
||||
|
||||
if (response) {
|
||||
const parsedResponse = JSON.parse(response) as Doc;
|
||||
dispatch(setSelectedDocs(parsedResponse));
|
||||
setLocalSelectedDocs(parsedResponse);
|
||||
setModalState('INACTIVE');
|
||||
}
|
||||
}
|
||||
|
||||
async function requestDocs() {
|
||||
const data = await getDocs();
|
||||
dispatch(setSourceDocs(data));
|
||||
}
|
||||
|
||||
getRecentDocs();
|
||||
requestDocs();
|
||||
}, []);
|
||||
|
||||
return (
|
||||
<div
|
||||
className={`${
|
||||
modalState === 'ACTIVE' ? 'visible' : 'hidden'
|
||||
} absolute z-30 h-screen w-screen bg-gray-alpha`}
|
||||
>
|
||||
<article className="mx-auto mt-24 flex w-[90vw] max-w-lg flex-col gap-4 rounded-lg bg-white p-6 shadow-lg">
|
||||
<p className="text-xl text-jet">Select Source Documentation</p>
|
||||
<p className="text-lg leading-5 text-gray-500">
|
||||
Please select the library of documentation that you would like to use
|
||||
with our app.
|
||||
</p>
|
||||
<div className="relative">
|
||||
<div
|
||||
className="h-10 w-full cursor-pointer border-b-2"
|
||||
onClick={() => setIsDocsListOpen(!isDocsListOpen)}
|
||||
>
|
||||
{!localSelectedDocs ? (
|
||||
<p className="py-3 text-gray-500">Select</p>
|
||||
) : (
|
||||
<p className="py-3">
|
||||
{localSelectedDocs.name} {localSelectedDocs.version}
|
||||
</p>
|
||||
)}
|
||||
</div>
|
||||
{isDocsListOpen && (
|
||||
<div className="absolute top-10 left-0 max-h-52 w-full overflow-y-scroll bg-white">
|
||||
{docs ? (
|
||||
docs.map((doc, index) => {
|
||||
if (doc.model) {
|
||||
return (
|
||||
<div
|
||||
key={index}
|
||||
onClick={() => {
|
||||
setLocalSelectedDocs(doc);
|
||||
setIsDocsListOpen(false);
|
||||
}}
|
||||
className="h-10 w-full cursor-pointer border-x-2 border-b-2 hover:bg-gray-100"
|
||||
>
|
||||
<p className="ml-5 py-3">
|
||||
{doc.name} {doc.version}
|
||||
</p>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
})
|
||||
) : (
|
||||
<div className="h-10 w-full cursor-pointer border-x-2 border-b-2 hover:bg-gray-100">
|
||||
<p className="ml-5 py-3">No default documentation.</p>
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
<div className="flex flex-row-reverse">
|
||||
{isCancellable && (
|
||||
<button
|
||||
onClick={() => handleCancel()}
|
||||
className="ml-5 h-10 w-20 rounded-lg border border-violet-700 bg-white text-violet-800 transition-all hover:bg-violet-700 hover:text-white"
|
||||
>
|
||||
Cancel
|
||||
</button>
|
||||
)}
|
||||
<button
|
||||
onClick={() => {
|
||||
handleSubmit();
|
||||
}}
|
||||
className="ml-auto h-10 w-20 rounded-lg bg-violet-800 text-white transition-all hover:bg-violet-700"
|
||||
>
|
||||
Save
|
||||
</button>
|
||||
{isError && (
|
||||
<p className="mr-auto text-sm text-red-500">
|
||||
Please select source documentation.
|
||||
</p>
|
||||
)}
|
||||
</div>
|
||||
</article>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
import { useEffect, useState } from 'react';
|
||||
import { useDispatch, useSelector } from 'react-redux';
|
||||
import { ActiveState } from '../models/misc';
|
||||
import {
|
||||
setSelectedDocs,
|
||||
setSourceDocs,
|
||||
selectSourceDocs,
|
||||
} from './preferenceSlice';
|
||||
import { getDocs, Doc } from './selectDocsApi';
|
||||
|
||||
export default function APIKeyModal({
|
||||
modalState,
|
||||
setModalState,
|
||||
isCancellable = true,
|
||||
}: {
|
||||
modalState: ActiveState;
|
||||
setModalState: (val: ActiveState) => void;
|
||||
isCancellable?: boolean;
|
||||
}) {
|
||||
const dispatch = useDispatch();
|
||||
const docs = useSelector(selectSourceDocs);
|
||||
const [localSelectedDocs, setLocalSelectedDocs] = useState<Doc | null>(null);
|
||||
const [isDocsListOpen, setIsDocsListOpen] = useState(false);
|
||||
const [isError, setIsError] = useState(false);
|
||||
|
||||
function handleSubmit() {
|
||||
if (!localSelectedDocs) {
|
||||
setIsError(true);
|
||||
} else {
|
||||
dispatch(setSelectedDocs(localSelectedDocs));
|
||||
setModalState('INACTIVE');
|
||||
setLocalSelectedDocs(null);
|
||||
setIsError(false);
|
||||
}
|
||||
}
|
||||
|
||||
function handleCancel() {
|
||||
setLocalSelectedDocs(null);
|
||||
setIsError(false);
|
||||
setModalState('INACTIVE');
|
||||
}
|
||||
|
||||
useEffect(() => {
|
||||
async function requestDocs() {
|
||||
const data = await getDocs();
|
||||
dispatch(setSourceDocs(data));
|
||||
}
|
||||
|
||||
requestDocs();
|
||||
}, []);
|
||||
|
||||
return (
|
||||
<div
|
||||
className={`${
|
||||
modalState === 'ACTIVE' ? 'visible' : 'hidden'
|
||||
} absolute z-30 h-screen w-screen bg-gray-alpha`}
|
||||
>
|
||||
<article className="mx-auto mt-24 flex w-[90vw] max-w-lg flex-col gap-4 rounded-lg bg-white p-6 shadow-lg">
|
||||
<p className="text-xl text-jet">Select Source Documentation</p>
|
||||
<p className="text-lg leading-5 text-gray-500">
|
||||
Please select the library of documentation that you would like to use
|
||||
with our app.
|
||||
</p>
|
||||
<div className="relative">
|
||||
<div
|
||||
className="h-10 w-full cursor-pointer border-b-2"
|
||||
onClick={() => setIsDocsListOpen(!isDocsListOpen)}
|
||||
>
|
||||
{!localSelectedDocs ? (
|
||||
<p className="py-3 text-gray-500">Select</p>
|
||||
) : (
|
||||
<p className="py-3">
|
||||
{localSelectedDocs.name} {localSelectedDocs.version}
|
||||
</p>
|
||||
)}
|
||||
</div>
|
||||
{isDocsListOpen && (
|
||||
<div className="absolute top-10 left-0 max-h-52 w-full overflow-y-scroll bg-white">
|
||||
{docs ? (
|
||||
docs.map((doc, index) => {
|
||||
if (doc.model) {
|
||||
return (
|
||||
<div
|
||||
key={index}
|
||||
onClick={() => {
|
||||
setLocalSelectedDocs(doc);
|
||||
setIsDocsListOpen(false);
|
||||
}}
|
||||
className="h-10 w-full cursor-pointer border-x-2 border-b-2 hover:bg-gray-100"
|
||||
>
|
||||
<p className="ml-5 py-3">
|
||||
{doc.name} {doc.version}
|
||||
</p>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
})
|
||||
) : (
|
||||
<div className="h-10 w-full cursor-pointer border-x-2 border-b-2 hover:bg-gray-100">
|
||||
<p className="ml-5 py-3">No default documentation.</p>
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
<div className="flex flex-row-reverse">
|
||||
{isCancellable && (
|
||||
<button
|
||||
onClick={() => handleCancel()}
|
||||
className="ml-5 h-10 w-20 rounded-lg border border-violet-700 bg-white text-violet-800 transition-all hover:bg-violet-700 hover:text-white"
|
||||
>
|
||||
Cancel
|
||||
</button>
|
||||
)}
|
||||
<button
|
||||
onClick={() => {
|
||||
handleSubmit();
|
||||
}}
|
||||
className="ml-auto h-10 w-20 rounded-lg bg-violet-800 text-white transition-all hover:bg-violet-700"
|
||||
>
|
||||
Save
|
||||
</button>
|
||||
{isError && (
|
||||
<p className="mr-auto text-sm text-red-500">
|
||||
Please select source documentation.
|
||||
</p>
|
||||
)}
|
||||
</div>
|
||||
</article>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
@@ -12,8 +12,10 @@ module.exports = {
|
||||
jet: '#343541',
|
||||
'gray-alpha': 'rgba(0,0,0, .1)',
|
||||
'gray-1000': '#F6F6F6',
|
||||
'gray-2000': 'rgba(0, 0, 0, 0.5)',
|
||||
'gray-3000': 'rgba(243, 243, 243, 1)',
|
||||
'red-1000': 'rgb(254, 202, 202)',
|
||||
'red-2000' : '#F44336',
|
||||
'red-2000': '#F44336',
|
||||
'red-3000': '#621B16',
|
||||
},
|
||||
},
|
||||
|
||||
126
scripts/code_docs_gen.py
Normal file
126
scripts/code_docs_gen.py
Normal file
@@ -0,0 +1,126 @@
|
||||
from pathlib import Path
|
||||
from langchain.text_splitter import CharacterTextSplitter
|
||||
import faiss
|
||||
from langchain.vectorstores import FAISS
|
||||
from langchain.embeddings import OpenAIEmbeddings
|
||||
from langchain.llms import OpenAI
|
||||
from langchain.prompts import PromptTemplate
|
||||
import pickle
|
||||
import dotenv
|
||||
import tiktoken
|
||||
import sys
|
||||
from argparse import ArgumentParser
|
||||
import ast
|
||||
|
||||
dotenv.load_dotenv()
|
||||
|
||||
|
||||
ps = list(Path("inputs").glob("**/*.py"))
|
||||
data = []
|
||||
sources = []
|
||||
for p in ps:
|
||||
with open(p) as f:
|
||||
data.append(f.read())
|
||||
sources.append(p)
|
||||
|
||||
|
||||
|
||||
# with open('inputs/client.py', 'r') as f:
|
||||
# tree = ast.parse(f.read())
|
||||
|
||||
# print(tree)
|
||||
|
||||
|
||||
def get_functions_in_class(node):
|
||||
functions = []
|
||||
functions_code = []
|
||||
for child in node.body:
|
||||
if isinstance(child, ast.FunctionDef):
|
||||
functions.append(child.name)
|
||||
functions_code.append(ast.unparse(child))
|
||||
|
||||
return functions, functions_code
|
||||
|
||||
|
||||
def get_classes_and_functions(source_code):
|
||||
tree = ast.parse(source_code)
|
||||
classes = {}
|
||||
for node in tree.body:
|
||||
if isinstance(node, ast.ClassDef):
|
||||
class_name = node.name
|
||||
function_name, function = get_functions_in_class(node)
|
||||
# join function name and function code
|
||||
functions = dict(zip(function_name, function))
|
||||
classes[class_name] = functions
|
||||
return classes
|
||||
|
||||
|
||||
structure_dict = {}
|
||||
c1 = 0
|
||||
for code in data:
|
||||
classes = get_classes_and_functions(ast.parse(code))
|
||||
source = str(sources[c1])
|
||||
structure_dict[source] = classes
|
||||
c1 += 1
|
||||
|
||||
# save the structure dict as json
|
||||
import json
|
||||
with open('structure_dict.json', 'w') as f:
|
||||
json.dump(structure_dict, f)
|
||||
|
||||
|
||||
# llm = OpenAI(temperature=0)
|
||||
# prompt = PromptTemplate(
|
||||
# input_variables=["code"],
|
||||
# template="Code: {code}, Documentation: ",
|
||||
# )
|
||||
#
|
||||
# print(prompt.format(code="print('hello world')"))
|
||||
# print(llm(prompt.format(code="print('hello world')")))
|
||||
|
||||
|
||||
if not Path("outputs").exists():
|
||||
Path("outputs").mkdir()
|
||||
|
||||
c1 = len(structure_dict)
|
||||
c2 = 0
|
||||
for source, classes in structure_dict.items():
|
||||
c2 += 1
|
||||
print(f"Processing file {c2}/{c1}")
|
||||
f1 = len(classes)
|
||||
f2 = 0
|
||||
for class_name, functions in classes.items():
|
||||
f2 += 1
|
||||
print(f"Processing class {f2}/{f1}")
|
||||
source_w = source.replace("inputs/", "")
|
||||
source_w = source_w.replace(".py", ".txt")
|
||||
if not Path(f"outputs/{source_w}").exists():
|
||||
with open(f"outputs/{source_w}", "w") as f:
|
||||
f.write(f"Class: {class_name}")
|
||||
else:
|
||||
with open(f"outputs/{source_w}", "a") as f:
|
||||
f.write(f"\n\nClass: {class_name}")
|
||||
# append class name to the front
|
||||
for function in functions:
|
||||
b1 = len(functions)
|
||||
b2 = 0
|
||||
print(f"Processing function {b2}/{b1}")
|
||||
b2 += 1
|
||||
prompt = PromptTemplate(
|
||||
input_variables=["code"],
|
||||
template="Code: \n{code}, \nDocumentation: ",
|
||||
)
|
||||
llm = OpenAI(temperature=0)
|
||||
response = llm(prompt.format(code=functions[function]))
|
||||
|
||||
if not Path(f"outputs/{source_w}").exists():
|
||||
with open(f"outputs/{source_w}", "w") as f:
|
||||
f.write(f"Function: {functions[function]}, \nDocumentation: {response}")
|
||||
else:
|
||||
with open(f"outputs/{source_w}", "a") as f:
|
||||
f.write(f"\n\nFunction: {functions[function]}, \nDocumentation: {response}")
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -1,10 +1,12 @@
|
||||
from collections import defaultdict
|
||||
import os
|
||||
import sys
|
||||
import nltk
|
||||
import dotenv
|
||||
import typer
|
||||
import ast
|
||||
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
from typing import List, Optional
|
||||
|
||||
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
||||
@@ -12,6 +14,7 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter
|
||||
from parser.file.bulk import SimpleDirectoryReader
|
||||
from parser.schema.base import Document
|
||||
from parser.open_ai_func import call_openai_api, get_user_permission
|
||||
from parser.py2doc import get_classes, get_functions, transform_to_docs
|
||||
|
||||
dotenv.load_dotenv()
|
||||
|
||||
@@ -37,7 +40,7 @@ def ingest(yes: bool = typer.Option(False, "-y", "--yes", prompt=False,
|
||||
help="Maximum number of files to read."),
|
||||
formats: Optional[List[str]] = typer.Option([".rst", ".md"],
|
||||
help="""List of required extensions (list with .)
|
||||
Currently supported: .rst, .md, .pdf, .docx, .csv, .epub"""),
|
||||
Currently supported: .rst, .md, .pdf, .docx, .csv, .epub, .html"""),
|
||||
exclude: Optional[bool] = typer.Option(True, help="Whether to exclude hidden files (dotfiles).")):
|
||||
|
||||
"""
|
||||
@@ -78,5 +81,29 @@ def ingest(yes: bool = typer.Option(False, "-y", "--yes", prompt=False,
|
||||
for directory, folder_name in zip(dir, folder_names):
|
||||
process_one_docs(directory, folder_name)
|
||||
|
||||
|
||||
@app.command()
|
||||
def convert():
|
||||
ps = list(Path("inputs").glob("**/*.py"))
|
||||
data = []
|
||||
sources = []
|
||||
for p in ps:
|
||||
with open(p) as f:
|
||||
data.append(f.read())
|
||||
sources.append(p)
|
||||
|
||||
functions_dict = {}
|
||||
classes_dict = {}
|
||||
c1 = 0
|
||||
for code in data:
|
||||
functions = get_functions(ast.parse(code))
|
||||
source = str(sources[c1])
|
||||
functions_dict[source] = functions
|
||||
classes = get_classes(code)
|
||||
classes_dict[source] = classes
|
||||
c1 += 1
|
||||
|
||||
transform_to_docs(functions_dict, classes_dict)
|
||||
|
||||
if __name__ == "__main__":
|
||||
app()
|
||||
|
||||
167
scripts/outputs/test.md
Normal file
167
scripts/outputs/test.md
Normal file
@@ -0,0 +1,167 @@
|
||||
# Function name: get_functions_in_class
|
||||
|
||||
Function:
|
||||
```
|
||||
def get_functions_in_class(source_code, class_name):
|
||||
tree = ast.parse(source_code)
|
||||
functions = []
|
||||
for node in tree.body:
|
||||
if isinstance(node, ast.ClassDef):
|
||||
if node.name == class_name:
|
||||
for function in node.body:
|
||||
if isinstance(function, ast.FunctionDef):
|
||||
functions.append(function.name)
|
||||
return functions
|
||||
```,
|
||||
Documentation:
|
||||
|
||||
|
||||
get_functions_in_class(source_code, class_name)
|
||||
|
||||
Inputs:
|
||||
source_code (str): The source code of the program.
|
||||
class_name (str): The name of the class.
|
||||
|
||||
Outputs:
|
||||
functions (list): A list of the functions in the class.
|
||||
|
||||
Description:
|
||||
This function takes in a source code and a class name and returns a list of the functions in the class. It uses the ast module to parse the source code and find the class definition. It then iterates through the body of the class and checks if each node is a function definition. If it is, it adds the name of the function to the list of functions.
|
||||
|
||||
# Function name: process_functions
|
||||
|
||||
Function:
|
||||
```
|
||||
def process_functions(functions_dict):
|
||||
c1 = len(functions_dict)
|
||||
c2 = 0
|
||||
for (source, functions) in functions_dict.items():
|
||||
c2 += 1
|
||||
print(f'Processing file {c2}/{c1}')
|
||||
f1 = len(functions)
|
||||
f2 = 0
|
||||
source_w = source.replace('inputs/', '')
|
||||
source_w = source_w.replace('.py', '.md')
|
||||
create_subfolder(source_w)
|
||||
for (name, function) in functions.items():
|
||||
f2 += 1
|
||||
print(f'Processing function {f2}/{f1}')
|
||||
response = generate_response(function)
|
||||
write_output_file(source_w, name, function, response)
|
||||
```,
|
||||
Documentation:
|
||||
|
||||
|
||||
This function takes in a dictionary of functions and processes them. It takes the source file and the functions from the dictionary and creates a subfolder for the source file. It then generates a response for each function and writes the output file. The output file contains the function, the response, and the source file.
|
||||
|
||||
# Function name: get_functions_in_class
|
||||
|
||||
Function:
|
||||
```
|
||||
def get_functions_in_class(source_code, class_name):
|
||||
tree = ast.parse(source_code)
|
||||
functions = []
|
||||
for node in tree.body:
|
||||
if isinstance(node, ast.ClassDef):
|
||||
if node.name == class_name:
|
||||
for function in node.body:
|
||||
if isinstance(function, ast.FunctionDef):
|
||||
functions.append(function.name)
|
||||
return functions
|
||||
```,
|
||||
Documentation:
|
||||
|
||||
|
||||
get_functions_in_class(source_code, class_name)
|
||||
|
||||
Inputs:
|
||||
source_code (str): The source code of the program.
|
||||
class_name (str): The name of the class.
|
||||
|
||||
Outputs:
|
||||
functions (list): A list of the functions in the class.
|
||||
|
||||
Description:
|
||||
This function takes in a source code and a class name and returns a list of the functions in the class. It uses the ast module to parse the source code and find the class definition. It then iterates through the body of the class and checks if each node is a function definition. If it is, it adds the name of the function to the list of functions.
|
||||
|
||||
# Function name: process_functions
|
||||
|
||||
Function:
|
||||
```
|
||||
def process_functions(functions_dict):
|
||||
c1 = len(functions_dict)
|
||||
c2 = 0
|
||||
for (source, functions) in functions_dict.items():
|
||||
c2 += 1
|
||||
print(f'Processing file {c2}/{c1}')
|
||||
f1 = len(functions)
|
||||
f2 = 0
|
||||
source_w = source.replace('inputs/', '')
|
||||
source_w = source_w.replace('.py', '.md')
|
||||
create_subfolder(source_w)
|
||||
for (name, function) in functions.items():
|
||||
f2 += 1
|
||||
print(f'Processing function {f2}/{f1}')
|
||||
response = generate_response(function)
|
||||
write_output_file(source_w, name, function, response)
|
||||
```,
|
||||
Documentation:
|
||||
|
||||
|
||||
This function takes in a dictionary of functions and processes them. It takes the source file and the functions from the dictionary and creates a subfolder for the source file. It then generates a response for each function and writes the output file for each function.
|
||||
|
||||
# Function name: get_functions_in_class
|
||||
|
||||
Function:
|
||||
```
|
||||
def get_functions_in_class(source_code, class_name):
|
||||
tree = ast.parse(source_code)
|
||||
functions = []
|
||||
for node in tree.body:
|
||||
if isinstance(node, ast.ClassDef):
|
||||
if node.name == class_name:
|
||||
for function in node.body:
|
||||
if isinstance(function, ast.FunctionDef):
|
||||
functions.append(function.name)
|
||||
return functions
|
||||
```,
|
||||
Documentation:
|
||||
|
||||
|
||||
get_functions_in_class(source_code, class_name)
|
||||
|
||||
Inputs:
|
||||
source_code (str): The source code of the program.
|
||||
class_name (str): The name of the class.
|
||||
|
||||
Outputs:
|
||||
functions (list): A list of the functions in the class.
|
||||
|
||||
Description:
|
||||
This function takes in a source code and a class name and returns a list of the functions in the class. It uses the ast module to parse the source code and find the class definition. It then iterates through the body of the class and checks if each node is a function definition. If it is, it adds the name of the function to the list of functions.
|
||||
|
||||
# Function name: process_functions
|
||||
|
||||
Function:
|
||||
```
|
||||
def process_functions(functions_dict):
|
||||
c1 = len(functions_dict)
|
||||
c2 = 0
|
||||
for (source, functions) in functions_dict.items():
|
||||
c2 += 1
|
||||
print(f'Processing file {c2}/{c1}')
|
||||
f1 = len(functions)
|
||||
f2 = 0
|
||||
source_w = source.replace('inputs/', '')
|
||||
source_w = source_w.replace('.py', '.md')
|
||||
create_subfolder(source_w)
|
||||
for (name, function) in functions.items():
|
||||
f2 += 1
|
||||
print(f'Processing function {f2}/{f1}')
|
||||
response = generate_response(function)
|
||||
write_output_file(source_w, name, function, response)
|
||||
```,
|
||||
Documentation:
|
||||
|
||||
|
||||
This function takes in a dictionary of functions and processes them. It takes the source file and the functions from the dictionary and creates a subfolder for the source file. It then generates a response for each function and writes the output file for each function.
|
||||
@@ -7,6 +7,7 @@ from parser.file.base import BaseReader
|
||||
from parser.file.base_parser import BaseParser
|
||||
from parser.file.docs_parser import DocxParser, PDFParser
|
||||
from parser.file.epub_parser import EpubParser
|
||||
from parser.file.html_parser import HTMLParser
|
||||
from parser.file.markdown_parser import MarkdownParser
|
||||
from parser.file.rst_parser import RstParser
|
||||
from parser.file.tabular_parser import PandasCSVParser
|
||||
@@ -19,6 +20,7 @@ DEFAULT_FILE_EXTRACTOR: Dict[str, BaseParser] = {
|
||||
".epub": EpubParser(),
|
||||
".md": MarkdownParser(),
|
||||
".rst": RstParser(),
|
||||
".html": HTMLParser(),
|
||||
}
|
||||
|
||||
|
||||
|
||||
82
scripts/parser/file/html_parser.py
Normal file
82
scripts/parser/file/html_parser.py
Normal file
@@ -0,0 +1,82 @@
|
||||
"""HTML parser.
|
||||
|
||||
Contains parser for html files.
|
||||
|
||||
"""
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import Dict, Union
|
||||
|
||||
from parser.file.base_parser import BaseParser
|
||||
|
||||
class HTMLParser(BaseParser):
|
||||
"""HTML parser."""
|
||||
|
||||
def _init_parser(self) -> Dict:
|
||||
"""Init parser."""
|
||||
return {}
|
||||
|
||||
def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, list[str]]:
|
||||
"""Parse file.
|
||||
|
||||
Returns:
|
||||
Union[str, List[str]]: a string or a List of strings.
|
||||
"""
|
||||
try:
|
||||
import unstructured
|
||||
except ImportError:
|
||||
raise ValueError("unstructured package is required to parse HTML files.")
|
||||
from unstructured.partition.html import partition_html
|
||||
from unstructured.staging.base import convert_to_isd
|
||||
from unstructured.cleaners.core import clean
|
||||
|
||||
# Using the unstructured library to convert the html to isd format
|
||||
# isd sample : isd = [
|
||||
# {"text": "My Title", "type": "Title"},
|
||||
# {"text": "My Narrative", "type": "NarrativeText"}
|
||||
# ]
|
||||
with open(file, "r", encoding="utf-8") as fp:
|
||||
elements = partition_html(file=fp)
|
||||
isd = convert_to_isd(elements)
|
||||
|
||||
# Removing non ascii charactwers from isd_el['text']
|
||||
for isd_el in isd:
|
||||
isd_el['text'] = isd_el['text'].encode("ascii", "ignore").decode()
|
||||
|
||||
# Removing all the \n characters from isd_el['text'] using regex and replace with single space
|
||||
# Removing all the extra spaces from isd_el['text'] using regex and replace with single space
|
||||
for isd_el in isd:
|
||||
isd_el['text'] = re.sub(r'\n', ' ', isd_el['text'], flags=re.MULTILINE|re.DOTALL)
|
||||
isd_el['text'] = re.sub(r"\s{2,}"," ", isd_el['text'], flags=re.MULTILINE|re.DOTALL)
|
||||
|
||||
# more cleaning: extra_whitespaces, dashes, bullets, trailing_punctuation
|
||||
for isd_el in isd:
|
||||
clean(isd_el['text'], extra_whitespace=True, dashes=True, bullets=True, trailing_punctuation=True )
|
||||
|
||||
# Creating a list of all the indexes of isd_el['type'] = 'Title'
|
||||
title_indexes = [i for i,isd_el in enumerate(isd) if isd_el['type'] == 'Title']
|
||||
|
||||
# Creating 'Chunks' - List of lists of strings
|
||||
# each list starting with with isd_el['type'] = 'Title' and all the data till the next 'Title'
|
||||
# Each Chunk can be thought of as an individual set of data, which can be sent to the model
|
||||
# Where Each Title is grouped together with the data under it
|
||||
|
||||
Chunks = [[]]
|
||||
final_chunks = list(list())
|
||||
|
||||
for i,isd_el in enumerate(isd):
|
||||
if i in title_indexes:
|
||||
Chunks.append([])
|
||||
Chunks[-1].append(isd_el['text'])
|
||||
|
||||
# Removing all the chunks with sum of lenth of all the strings in the chunk < 25 #TODO: This value can be an user defined variable
|
||||
for chunk in Chunks:
|
||||
# sum of lenth of all the strings in the chunk
|
||||
sum = 0
|
||||
sum += len(str(chunk))
|
||||
if sum < 25:
|
||||
Chunks.remove(chunk)
|
||||
else :
|
||||
# appending all the approved chunks to final_chunks as a single string
|
||||
final_chunks.append(" ".join([str(item) for item in chunk]))
|
||||
return final_chunks
|
||||
155
scripts/parser/py2doc.py
Normal file
155
scripts/parser/py2doc.py
Normal file
@@ -0,0 +1,155 @@
|
||||
from pathlib import Path
|
||||
from langchain.llms import OpenAI
|
||||
from langchain.prompts import PromptTemplate
|
||||
import dotenv
|
||||
import ast
|
||||
import typer
|
||||
import tiktoken
|
||||
|
||||
dotenv.load_dotenv()
|
||||
|
||||
def get_functions(source_code):
|
||||
tree = ast.parse(source_code)
|
||||
functions = {}
|
||||
for node in tree.body:
|
||||
if isinstance(node, ast.FunctionDef):
|
||||
functions[node.name] = ast.unparse(node)
|
||||
|
||||
return functions
|
||||
|
||||
def get_functions_names(node):
|
||||
functions = []
|
||||
for child in node.body:
|
||||
if isinstance(child, ast.FunctionDef):
|
||||
functions.append(child.name)
|
||||
return functions
|
||||
|
||||
|
||||
|
||||
def get_classes(source_code):
|
||||
tree = ast.parse(source_code)
|
||||
classes = {}
|
||||
for node in tree.body:
|
||||
if isinstance(node, ast.ClassDef):
|
||||
classes[node.name] = get_functions_names(node)
|
||||
return classes
|
||||
|
||||
def get_functions_in_class(source_code, class_name):
|
||||
tree = ast.parse(source_code)
|
||||
functions = []
|
||||
for node in tree.body:
|
||||
if isinstance(node, ast.ClassDef):
|
||||
if node.name == class_name:
|
||||
for function in node.body:
|
||||
if isinstance(function, ast.FunctionDef):
|
||||
functions.append(function.name)
|
||||
return functions
|
||||
|
||||
|
||||
def parse_functions(functions_dict):
|
||||
c1 = len(functions_dict)
|
||||
c2 = 0
|
||||
for source, functions in functions_dict.items():
|
||||
c2 += 1
|
||||
print(f"Processing file {c2}/{c1}")
|
||||
f1 = len(functions)
|
||||
f2 = 0
|
||||
source_w = source.replace("inputs/", "")
|
||||
source_w = source_w.replace(".py", ".md")
|
||||
# this is how we check subfolders
|
||||
if "/" in source_w:
|
||||
subfolders = source_w.split("/")
|
||||
subfolders = subfolders[:-1]
|
||||
subfolders = "/".join(subfolders)
|
||||
if not Path(f"outputs/{subfolders}").exists():
|
||||
Path(f"outputs/{subfolders}").mkdir(parents=True)
|
||||
|
||||
for name, function in functions.items():
|
||||
f2 += 1
|
||||
print(f"Processing function {f2}/{f1}")
|
||||
prompt = PromptTemplate(
|
||||
input_variables=["code"],
|
||||
template="Code: \n{code}, \nDocumentation: ",
|
||||
)
|
||||
llm = OpenAI(temperature=0)
|
||||
response = llm(prompt.format(code=function))
|
||||
|
||||
if not Path(f"outputs/{source_w}").exists():
|
||||
with open(f"outputs/{source_w}", "w") as f:
|
||||
f.write(f"# Function name: {name} \n\nFunction: \n```\n{function}\n```, \nDocumentation: \n{response}")
|
||||
else:
|
||||
with open(f"outputs/{source_w}", "a") as f:
|
||||
f.write(f"\n\n# Function name: {name} \n\nFunction: \n```\n{function}\n```, \nDocumentation: \n{response}")
|
||||
|
||||
|
||||
def parse_classes(classes_dict):
|
||||
c1 = len(classes_dict)
|
||||
c2 = 0
|
||||
for source, classes in classes_dict.items():
|
||||
c2 += 1
|
||||
print(f"Processing file {c2}/{c1}")
|
||||
f1 = len(classes)
|
||||
f2 = 0
|
||||
source_w = source.replace("inputs/", "")
|
||||
source_w = source_w.replace(".py", ".md")
|
||||
|
||||
if "/" in source_w:
|
||||
subfolders = source_w.split("/")
|
||||
subfolders = subfolders[:-1]
|
||||
subfolders = "/".join(subfolders)
|
||||
if not Path(f"outputs/{subfolders}").exists():
|
||||
Path(f"outputs/{subfolders}").mkdir(parents=True)
|
||||
|
||||
for name, function_names in classes.items():
|
||||
print(f"Processing Class {f2}/{f1}")
|
||||
f2 += 1
|
||||
prompt = PromptTemplate(
|
||||
input_variables=["class_name", "functions_names"],
|
||||
template="Class name: {class_name} \nFunctions: {functions_names}, \nDocumentation: ",
|
||||
)
|
||||
llm = OpenAI(temperature=0)
|
||||
response = llm(prompt.format(class_name=name, functions_names=function_names))
|
||||
|
||||
if not Path(f"outputs/{source_w}").exists():
|
||||
with open(f"outputs/{source_w}", "w") as f:
|
||||
f.write(f"# Class name: {name} \n\nFunctions: \n{function_names}, \nDocumentation: \n{response}")
|
||||
else:
|
||||
with open(f"outputs/{source_w}", "a") as f:
|
||||
f.write(f"\n\n# Class name: {name} \n\nFunctions: \n{function_names}, \nDocumentation: \n{response}")
|
||||
|
||||
|
||||
#User permission
|
||||
def transform_to_docs(functions_dict, classes_dict):
|
||||
# Function to ask user permission to call the OpenAI api and spend their OpenAI funds.
|
||||
# Here we convert dicts to a string and calculate the number of OpenAI tokens the string represents.
|
||||
docs_content = ""
|
||||
for key, value in functions_dict.items():
|
||||
docs_content += str(key) + str(value)
|
||||
for key, value in classes_dict.items():
|
||||
docs_content += str(key) + str(value)
|
||||
|
||||
encoding = tiktoken.get_encoding("cl100k_base")
|
||||
num_tokens = len(encoding.encode(docs_content))
|
||||
total_price = ((num_tokens / 1000) * 0.02)
|
||||
|
||||
# Here we print the number of tokens and the approx user cost with some visually appealing formatting.
|
||||
print(f"Number of Tokens = {format(num_tokens, ',d')}")
|
||||
print(f"Approx Cost = ${format(total_price, ',.2f')}")
|
||||
#Here we check for user permission before calling the API.
|
||||
user_input = input("Price Okay? (Y/N) \n").lower()
|
||||
if user_input == "y":
|
||||
if not Path("outputs").exists():
|
||||
Path("outputs").mkdir()
|
||||
parse_functions(functions_dict)
|
||||
print("Functions done!")
|
||||
parse_classes(classes_dict)
|
||||
print("All done!")
|
||||
elif user_input == "":
|
||||
if not Path("outputs").exists():
|
||||
Path("outputs").mkdir()
|
||||
parse_functions(functions_dict)
|
||||
print("Functions done!")
|
||||
parse_classes(classes_dict)
|
||||
print("All done!")
|
||||
else:
|
||||
print("The API was not called. No money was spent.")
|
||||
Reference in New Issue
Block a user