diff --git a/.gitignore b/.gitignore
index d7747efb..ac5ff190 100644
--- a/.gitignore
+++ b/.gitignore
@@ -75,6 +75,7 @@ target/
 
 # Jupyter Notebook
 .ipynb_checkpoints
+**/*.ipynb
 
 # IPython
 profile_default/
diff --git a/application/api/answer/routes.py b/application/api/answer/routes.py
index abb2f67c..bd1fa21f 100644
--- a/application/api/answer/routes.py
+++ b/application/api/answer/routes.py
@@ -29,12 +29,15 @@ prompts_collection = db["prompts"]
 api_key_collection = db["api_keys"]
 answer = Blueprint('answer', __name__)
 
-if settings.LLM_NAME == "gpt4":
-    gpt_model = 'gpt-4'
+gpt_model = ""
+# to have some kind of default behaviour
+if settings.LLM_NAME == "openai":
+    gpt_model = 'gpt-3.5-turbo'
 elif settings.LLM_NAME == "anthropic":
     gpt_model = 'claude-2'
-else:
-    gpt_model = 'gpt-3.5-turbo'
+
+if settings.MODEL_NAME:  # in case there is particular model name configured
+    gpt_model = settings.MODEL_NAME
 
 # load the prompts
 current_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
@@ -102,9 +105,8 @@ def is_azure_configured():
     return settings.OPENAI_API_BASE and settings.OPENAI_API_VERSION and settings.AZURE_DEPLOYMENT_NAME
 
 
-def complete_stream(question, docsearch, chat_history, prompt_id, conversation_id):
+def complete_stream(question, docsearch, chat_history, prompt_id, conversation_id, chunks=2):
     llm = LLMCreator.create_llm(settings.LLM_NAME, api_key=settings.API_KEY)
-
     if prompt_id == 'default':
         prompt = chat_combine_template
     elif prompt_id == 'creative':
@@ -113,8 +115,11 @@ def complete_stream(question, docsearch, chat_history, prompt_id, conversation_i
         prompt = chat_combine_strict
     else:
         prompt = prompts_collection.find_one({"_id": ObjectId(prompt_id)})["content"]
-
-    docs = docsearch.search(question, k=2)
+
+    if chunks == 0:
+        docs = []
+    else:
+        docs = docsearch.search(question, k=chunks)
     if settings.LLM_NAME == "llama.cpp":
         docs = [docs[0]]
     # join all page_content together with a newline
@@ -202,6 +207,10 @@ def stream():
         prompt_id = data["prompt_id"]
     else:
         prompt_id = 'default'
+    if 'chunks' in data:
+        chunks = int(data["chunks"])
+    else:
+        chunks = 2
 
     # check if active_docs is set
 
@@ -218,7 +227,8 @@ def stream():
         complete_stream(question, docsearch,
                         chat_history=history,
                         prompt_id=prompt_id,
-                        conversation_id=conversation_id), mimetype="text/event-stream"
+                        conversation_id=conversation_id,
+                        chunks=chunks), mimetype="text/event-stream"
     )
 
 
@@ -239,6 +249,10 @@ def api_answer():
         prompt_id = data["prompt_id"]
     else:
         prompt_id = 'default'
+    if 'chunks' in data:
+        chunks = int(data["chunks"])
+    else:
+        chunks = 2
 
     if prompt_id == 'default':
         prompt = chat_combine_template
@@ -266,7 +280,10 @@ def api_answer():
 
 
 
-    docs = docsearch.search(question, k=2)
+    if chunks == 0:
+        docs = []
+    else:
+        docs = docsearch.search(question, k=chunks)
     # join all page_content together with a newline
     docs_together = "\n".join([doc.page_content for doc in docs])
     p_chat_combine = prompt.replace("{summaries}", docs_together)
@@ -361,9 +378,15 @@ def api_search():
         vectorstore = get_vectorstore({"active_docs": data["active_docs"]})
     else:
         vectorstore = ""
+    if 'chunks' in data:
+        chunks = int(data["chunks"])
+    else:
+        chunks = 2
     docsearch = VectorCreator.create_vectorstore(settings.VECTOR_STORE, vectorstore, settings.EMBEDDINGS_KEY)
-
-    docs = docsearch.search(question, k=2)
+    if chunks == 0:
+        docs = []
+    else:
+        docs = docsearch.search(question, k=chunks)
 
     source_log_docs = []
     for doc in docs:
diff --git a/application/core/settings.py b/application/core/settings.py
index 84073b7d..0e1909e6 100644
--- a/application/core/settings.py
+++ b/application/core/settings.py
@@ -9,6 +9,7 @@ current_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__
 
 class Settings(BaseSettings):
     LLM_NAME: str = "docsgpt"
+    MODEL_NAME: Optional[str] = None  # when LLM_NAME is openai, MODEL_NAME can be e.g. gpt-4-turbo-preview or gpt-3.5-turbo
     EMBEDDINGS_NAME: str = "huggingface_sentence-transformers/all-mpnet-base-v2"
     CELERY_BROKER_URL: str = "redis://localhost:6379/0"
     CELERY_RESULT_BACKEND: str = "redis://localhost:6379/1"
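Taken together, the two changes above make retrieval depth and the model name configurable per request and per deployment. A minimal sketch of exercising the new `chunks` field against the non-streaming endpoint, assuming a local instance on port 7091 and an already-ingested source; the endpoint path is inferred from the view-function name, and every payload value here is illustrative, only the `chunks` and `prompt_id` handling comes from the diff above:

```python
import requests

# Assumed local deployment; adjust host/port to your instance.
BASE_URL = "http://localhost:7091"

payload = {
    "question": "How do I configure the vector store?",
    "history": "[]",
    "active_docs": "local/my-docs/",  # placeholder for an already-ingested source
    "prompt_id": "default",
    "chunks": 4,  # new field: retrieve 4 chunks instead of the old hard-coded k=2
}

# With LLM_NAME=openai and MODEL_NAME=gpt-4-turbo-preview set in the environment,
# the same request would be served by the configured model rather than the
# gpt-3.5-turbo default.
response = requests.post(f"{BASE_URL}/api/answer", json=payload)
print(response.json())
```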
diff --git a/application/parser/remote/reddit_loader.py b/application/parser/remote/reddit_loader.py
new file mode 100644
index 00000000..0230653a
--- /dev/null
+++ b/application/parser/remote/reddit_loader.py
@@ -0,0 +1,26 @@
+from application.parser.remote.base import BaseRemote
+from langchain_community.document_loaders import RedditPostsLoader
+
+
+class RedditPostsLoaderRemote(BaseRemote):
+    def load_data(self, inputs):
+        data = eval(inputs)
+        client_id = data.get("client_id")
+        client_secret = data.get("client_secret")
+        user_agent = data.get("user_agent")
+        categories = data.get("categories", ["new", "hot"])
+        mode = data.get("mode", "subreddit")
+        search_queries = data.get("search_queries")
+        number_posts = data.get("number_posts", 10)
+        self.loader = RedditPostsLoader(
+            client_id=client_id,
+            client_secret=client_secret,
+            user_agent=user_agent,
+            categories=categories,
+            mode=mode,
+            search_queries=search_queries,
+            number_posts=number_posts,
+        )
+        documents = self.loader.load()
+        print(f"Loaded {len(documents)} documents from Reddit")
+        return documents
diff --git a/application/parser/remote/remote_creator.py b/application/parser/remote/remote_creator.py
index e45333d4..d2a58f8d 100644
--- a/application/parser/remote/remote_creator.py
+++ b/application/parser/remote/remote_creator.py
@@ -1,13 +1,15 @@
 from application.parser.remote.sitemap_loader import SitemapLoader
 from application.parser.remote.crawler_loader import CrawlerLoader
 from application.parser.remote.web_loader import WebLoader
+from application.parser.remote.reddit_loader import RedditPostsLoaderRemote
 
 
 class RemoteCreator:
     loaders = {
-        'url': WebLoader,
-        'sitemap': SitemapLoader,
-        'crawler': CrawlerLoader
+        "url": WebLoader,
+        "sitemap": SitemapLoader,
+        "crawler": CrawlerLoader,
+        "reddit": RedditPostsLoaderRemote,
     }
 
     @classmethod
@@ -15,4 +17,4 @@ class RemoteCreator:
         loader_class = cls.loaders.get(type.lower())
         if not loader_class:
             raise ValueError(f"No LLM class found for type {type}")
-        return loader_class(*args, **kwargs)
\ No newline at end of file
+        return loader_class(*args, **kwargs)
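For context on how the new loader is reached through the factory: `load_data` passes its string argument through `eval`, so callers hand it a dict literal. A sketch with placeholder Reddit credentials; the key names mirror `reddit_loader.py` above, all values are made up:

```python
from application.parser.remote.remote_creator import RemoteCreator

# Placeholder Reddit API credentials; real values come from your Reddit app.
inputs = str(
    {
        "client_id": "YOUR_CLIENT_ID",
        "client_secret": "YOUR_CLIENT_SECRET",
        "user_agent": "docsgpt-ingest/0.1",
        "mode": "subreddit",
        "search_queries": ["machinelearning"],  # subreddit names in subreddit mode
        # "categories" defaults to ["new", "hot"], "number_posts" to 10
    }
)

loader = RemoteCreator.create_loader("reddit")
documents = loader.load_data(inputs)
```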
diff --git a/application/worker.py b/application/worker.py
index 21bb319f..3891fde9 100644
--- a/application/worker.py
+++ b/application/worker.py
@@ -15,23 +15,27 @@ from application.parser.schema.base import Document
 from application.parser.token_func import group_split
 
 try:
-    nltk.download('punkt', quiet=True)
-    nltk.download('averaged_perceptron_tagger', quiet=True)
+    nltk.download("punkt", quiet=True)
+    nltk.download("averaged_perceptron_tagger", quiet=True)
 except FileExistsError:
     pass
 
 
 # Define a function to extract metadata from a given filename.
 def metadata_from_filename(title):
-    store = '/'.join(title.split('/')[1:3])
-    return {'title': title, 'store': store}
+    store = "/".join(title.split("/")[1:3])
+    return {"title": title, "store": store}
 
 
 # Define a function to generate a random string of a given length.
 def generate_random_string(length):
-    return ''.join([string.ascii_letters[i % 52] for i in range(length)])
+    return "".join([string.ascii_letters[i % 52] for i in range(length)])
+
+
+current_dir = os.path.dirname(
+    os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+)
 
-current_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
 # Define the main function for ingesting and processing documents.
 def ingest_worker(self, directory, formats, name_job, filename, user):
@@ -62,38 +66,52 @@ def ingest_worker(self, directory, formats, name_job, filename, user):
     token_check = True
     min_tokens = 150
     max_tokens = 1250
-    full_path = directory + '/' + user + '/' + name_job
+    full_path = directory + "/" + user + "/" + name_job
     import sys
+
     print(full_path, file=sys.stderr)
     # check if API_URL env variable is set
-    file_data = {'name': name_job, 'file': filename, 'user': user}
-    response = requests.get(urljoin(settings.API_URL, "/api/download"), params=file_data)
+    file_data = {"name": name_job, "file": filename, "user": user}
+    response = requests.get(
+        urljoin(settings.API_URL, "/api/download"), params=file_data
+    )
     # check if file is in the response
     print(response, file=sys.stderr)
     file = response.content
 
     if not os.path.exists(full_path):
         os.makedirs(full_path)
-    with open(full_path + '/' + filename, 'wb') as f:
+    with open(full_path + "/" + filename, "wb") as f:
         f.write(file)
 
     # check if file is .zip and extract it
-    if filename.endswith('.zip'):
-        with zipfile.ZipFile(full_path + '/' + filename, 'r') as zip_ref:
+    if filename.endswith(".zip"):
+        with zipfile.ZipFile(full_path + "/" + filename, "r") as zip_ref:
             zip_ref.extractall(full_path)
-        os.remove(full_path + '/' + filename)
+        os.remove(full_path + "/" + filename)
 
-    self.update_state(state='PROGRESS', meta={'current': 1})
+    self.update_state(state="PROGRESS", meta={"current": 1})
 
-    raw_docs = SimpleDirectoryReader(input_dir=full_path, input_files=input_files, recursive=recursive,
-                                     required_exts=formats, num_files_limit=limit,
-                                     exclude_hidden=exclude, file_metadata=metadata_from_filename).load_data()
-    raw_docs = group_split(documents=raw_docs, min_tokens=min_tokens, max_tokens=max_tokens, token_check=token_check)
+    raw_docs = SimpleDirectoryReader(
+        input_dir=full_path,
+        input_files=input_files,
+        recursive=recursive,
+        required_exts=formats,
+        num_files_limit=limit,
+        exclude_hidden=exclude,
+        file_metadata=metadata_from_filename,
+    ).load_data()
+    raw_docs = group_split(
+        documents=raw_docs,
+        min_tokens=min_tokens,
+        max_tokens=max_tokens,
+        token_check=token_check,
+    )
 
     docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs]
 
     call_openai_api(docs, full_path, self)
-    self.update_state(state='PROGRESS', meta={'current': 100})
+    self.update_state(state="PROGRESS", meta={"current": 100})
 
     if sample:
         for i in range(min(5, len(raw_docs))):
@@ -101,70 +119,80 @@ def ingest_worker(self, directory, formats, name_job, filename, user):
 
     # get files from outputs/inputs/index.faiss and outputs/inputs/index.pkl
     # and send them to the server (provide user and name in form)
-    file_data = {'name': name_job, 'user': user}
+    file_data = {"name": name_job, "user": user}
     if settings.VECTOR_STORE == "faiss":
-        files = {'file_faiss': open(full_path + '/index.faiss', 'rb'),
-                 'file_pkl': open(full_path + '/index.pkl', 'rb')}
-        response = requests.post(urljoin(settings.API_URL, "/api/upload_index"), files=files, data=file_data)
-        response = requests.get(urljoin(settings.API_URL, "/api/delete_old?path=" + full_path))
+        files = {
+            "file_faiss": open(full_path + "/index.faiss", "rb"),
+            "file_pkl": open(full_path + "/index.pkl", "rb"),
+        }
+        response = requests.post(
+            urljoin(settings.API_URL, "/api/upload_index"), files=files, data=file_data
+        )
+        response = requests.get(
+            urljoin(settings.API_URL, "/api/delete_old?path=" + full_path)
+        )
     else:
-        response = requests.post(urljoin(settings.API_URL, "/api/upload_index"), data=file_data)
+        response = requests.post(
+            urljoin(settings.API_URL, "/api/upload_index"), data=file_data
+        )
 
-    # delete local
     shutil.rmtree(full_path)
 
     return {
-        'directory': directory,
-        'formats': formats,
-        'name_job': name_job,
-        'filename': filename,
-        'user': user,
-        'limited': False
+        "directory": directory,
+        "formats": formats,
+        "name_job": name_job,
+        "filename": filename,
+        "user": user,
+        "limited": False,
     }
 
 
-def remote_worker(self, source_data, name_job, user, directory = 'temp', loader = 'url'):
+
+def remote_worker(self, source_data, name_job, user, loader, directory="temp"):
     # sample = False
     token_check = True
     min_tokens = 150
     max_tokens = 1250
-    full_path = directory + '/' + user + '/' + name_job
+    full_path = directory + "/" + user + "/" + name_job
 
     if not os.path.exists(full_path):
         os.makedirs(full_path)
-    self.update_state(state='PROGRESS', meta={'current': 1})
-
+    self.update_state(state="PROGRESS", meta={"current": 1})
+
     # source_data {"data": [url]} for url type task just urls
-
+
     # Use RemoteCreator to load data from URL
     remote_loader = RemoteCreator.create_loader(loader)
     raw_docs = remote_loader.load_data(source_data)
 
-    docs = group_split(documents=raw_docs, min_tokens=min_tokens, max_tokens=max_tokens, token_check=token_check)
+    docs = group_split(
+        documents=raw_docs,
+        min_tokens=min_tokens,
+        max_tokens=max_tokens,
+        token_check=token_check,
+    )
 
-    #docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs]
+    # docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs]
     call_openai_api(docs, full_path, self)
-    self.update_state(state='PROGRESS', meta={'current': 100})
-
-
+    self.update_state(state="PROGRESS", meta={"current": 100})
 
     # Proceed with uploading and cleaning as in the original function
-    file_data = {'name': name_job, 'user': user}
+    file_data = {"name": name_job, "user": user}
     if settings.VECTOR_STORE == "faiss":
-        files = {'file_faiss': open(full_path + '/index.faiss', 'rb'),
-                 'file_pkl': open(full_path + '/index.pkl', 'rb')}
-        requests.post(urljoin(settings.API_URL, "/api/upload_index"), files=files, data=file_data)
+        files = {
+            "file_faiss": open(full_path + "/index.faiss", "rb"),
+            "file_pkl": open(full_path + "/index.pkl", "rb"),
+        }
+        requests.post(
+            urljoin(settings.API_URL, "/api/upload_index"), files=files, data=file_data
+        )
         requests.get(urljoin(settings.API_URL, "/api/delete_old?path=" + full_path))
     else:
        requests.post(urljoin(settings.API_URL, "/api/upload_index"), data=file_data)
 
     shutil.rmtree(full_path)
 
-    return {
-        'urls': source_data,
-        'name_job': name_job,
-        'user': user,
-        'limited': False
-    }
\ No newline at end of file
+    return {"urls": source_data, "name_job": name_job, "user": user, "limited": False}
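One consequence of the new `remote_worker` signature is worth flagging: `loader` now precedes `directory` and has no default, so positional call sites written against the old order must be updated. A sketch of the new call shape, with a stubbed object standing in for the bound Celery task and illustrative argument values:

```python
from unittest.mock import MagicMock

from application.worker import remote_worker

task = MagicMock()  # stands in for the bound Celery task (supplies update_state)

# loader is now the fifth positional argument and is required;
# directory still defaults to "temp".
result = remote_worker(
    task,
    "https://example.com/docs",  # source_data; its shape depends on the loader type
    "my-job",                    # name_job
    "local",                     # user
    "url",                       # loader: previously defaulted to "url", now explicit
)
```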
diff --git a/docs/package-lock.json b/docs/package-lock.json
index dc608479..e2c0f301 100644
--- a/docs/package-lock.json
+++ b/docs/package-lock.json
@@ -7,7 +7,7 @@
       "license": "MIT",
       "dependencies": {
         "@vercel/analytics": "^1.1.1",
-        "docsgpt": "^0.3.0",
+        "docsgpt": "^0.3.7",
         "next": "^14.0.4",
         "nextra": "^2.13.2",
         "nextra-theme-docs": "^2.13.2",
@@ -422,6 +422,11 @@
         "node": ">=6.9.0"
">=6.9.0" } }, + "node_modules/@bpmn-io/snarkdown": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/@bpmn-io/snarkdown/-/snarkdown-2.2.0.tgz", + "integrity": "sha512-bVD7FIoaBDZeCJkMRgnBPDeptPlto87wt2qaCjf5t8iLaevDmTPaREd6FpBEGsHlUdHFFZWRk4qAoEC5So2M0Q==" + }, "node_modules/@braintree/sanitize-url": { "version": "6.0.4", "resolved": "https://registry.npmjs.org/@braintree/sanitize-url/-/sanitize-url-6.0.4.tgz", @@ -4958,11 +4963,12 @@ } }, "node_modules/docsgpt": { - "version": "0.3.0", - "resolved": "https://registry.npmjs.org/docsgpt/-/docsgpt-0.3.0.tgz", - "integrity": "sha512-0yT2m+HAlJ+289p278c3Zi07bu2wr6zULOT/bYXtJ/nb59V2Vpfdj2xFB49+lYLSeVe8H+Ij5fFSNZ6RkVRfMQ==", + "version": "0.3.7", + "resolved": "https://registry.npmjs.org/docsgpt/-/docsgpt-0.3.7.tgz", + "integrity": "sha512-VHrXXOEFtjNTcpA8Blf3IzpLlJxOMhm/S5CM4FDjQEkdK9WWhI8yXd/0Rs/FS8oz7YbFrNxO758mlP7OtQtBBw==", "dependencies": { "@babel/plugin-transform-flow-strip-types": "^7.23.3", + "@bpmn-io/snarkdown": "^2.2.0", "@parcel/resolver-glob": "^2.12.0", "@parcel/transformer-svg-react": "^2.12.0", "@parcel/transformer-typescript-tsc": "^2.12.0", @@ -4972,6 +4978,7 @@ "@types/react-dom": "^18.2.19", "class-variance-authority": "^0.7.0", "clsx": "^2.1.0", + "dompurify": "^3.0.9", "flow-bin": "^0.229.2", "i": "^0.3.7", "install": "^0.13.0", @@ -5029,9 +5036,9 @@ } }, "node_modules/dompurify": { - "version": "3.0.7", - "resolved": "https://registry.npmjs.org/dompurify/-/dompurify-3.0.7.tgz", - "integrity": "sha512-BViYTZoqP3ak/ULKOc101y+CtHDUvBsVgSxIF1ku0HmK6BRf+C03MC+tArMvOPtVtZp83DDh5puywKDu4sbVjQ==" + "version": "3.0.11", + "resolved": "https://registry.npmjs.org/dompurify/-/dompurify-3.0.11.tgz", + "integrity": "sha512-Fan4uMuyB26gFV3ovPoEoQbxRRPfTu3CvImyZnhGq5fsIEO+gEFLp45ISFt+kQBWsK5ulDdT0oV28jS1UrwQLg==" }, "node_modules/domutils": { "version": "2.8.0", @@ -6206,9 +6213,9 @@ "integrity": "sha512-gfFQZrcTc8CnKXp6Y4/CBT3fTc0OVuDofpre4aEeEpSBPV5X5v4+Vmx+8snU7RLPrNHPKSgLxGo9YuQzz20o+w==" }, "node_modules/katex": { - "version": "0.16.9", - "resolved": "https://registry.npmjs.org/katex/-/katex-0.16.9.tgz", - "integrity": "sha512-fsSYjWS0EEOwvy81j3vRA8TEAhQhKiqO+FQaKWp0m39qwOzHVBgAUBIXWj1pB+O2W3fIpNa6Y9KSKCVbfPhyAQ==", + "version": "0.16.10", + "resolved": "https://registry.npmjs.org/katex/-/katex-0.16.10.tgz", + "integrity": "sha512-ZiqaC04tp2O5utMsl2TEZTXxa6WSC4yo0fv5ML++D3QZv/vx2Mct0mTlRx3O+uUkjfuAgOkzsCmq5MiUEsDDdA==", "funding": [ "https://opencollective.com/katex", "https://github.com/sponsors/katex" diff --git a/frontend/src/Setting.tsx b/frontend/src/Setting.tsx index 83cf06bf..172691a0 100644 --- a/frontend/src/Setting.tsx +++ b/frontend/src/Setting.tsx @@ -9,6 +9,8 @@ import { setPrompt, selectSourceDocs, setSourceDocs, + setChunks, + selectChunks, } from './preferences/preferenceSlice'; import { Doc } from './preferences/preferenceApi'; import { useDarkTheme } from './hooks'; @@ -190,10 +192,13 @@ const Setting: React.FC = () => { const General: React.FC = () => { const themes = ['Light', 'Dark']; const languages = ['English']; + const chunks = ['0', '2', '4', '6', '8', '10']; + const selectedChunks = useSelector(selectChunks); const [isDarkTheme, toggleTheme] = useDarkTheme(); const [selectedTheme, setSelectedTheme] = useState( isDarkTheme ? 'Dark' : 'Light', ); + const dispatch = useDispatch(); const [selectedLanguage, setSelectedLanguage] = useState(languages[0]); return (
@@ -218,6 +223,16 @@ const General: React.FC = () => {
         <p>Select Language</p>
         <Dropdown
           options={languages}
           selectedValue={selectedLanguage}
           onSelect={setSelectedLanguage}
         />
+        <div>
+          <p>
+            Chunks processed per query
+          </p>
+          <Dropdown
+            options={chunks}
+            selectedValue={selectedChunks}
+            onSelect={(value: string) => dispatch(setChunks(value))}
+          />
+        </div>
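On the backend, the `'0'` option this dropdown exposes short-circuits retrieval entirely (`docs = []`), turning a query into a pure-LLM answer. A quick way to verify that path, assuming the `api_search` view is mounted at `/api/search` and accepts JSON like its siblings; the source name is a placeholder:

```python
import requests

payload = {
    "question": "ping",
    "active_docs": "local/my-docs/",  # placeholder source
    "chunks": 0,  # chunks == 0 skips docsearch.search() entirely
}
sources = requests.post("http://localhost:7091/api/search", json=payload).json()
print(sources)  # expected: an empty list, since no documents were retrieved
```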