From 60cfea112609df2ffdc48b97d05493a458479154 Mon Sep 17 00:00:00 2001 From: Siddhant Rai Date: Sat, 16 Mar 2024 20:22:05 +0530 Subject: [PATCH 1/7] feat: added reddit loader --- .gitignore | 1 + application/parser/remote/reddit_loader.py | 27 ++++ application/parser/remote/remote_creator.py | 10 +- application/worker.py | 132 ++++++++++++-------- frontend/src/components/Dropdown.tsx | 4 +- frontend/src/upload/Upload.tsx | 1 + 6 files changed, 117 insertions(+), 58 deletions(-) create mode 100644 application/parser/remote/reddit_loader.py diff --git a/.gitignore b/.gitignore index d7747efb..ac5ff190 100644 --- a/.gitignore +++ b/.gitignore @@ -75,6 +75,7 @@ target/ # Jupyter Notebook .ipynb_checkpoints +**/*.ipynb # IPython profile_default/ diff --git a/application/parser/remote/reddit_loader.py b/application/parser/remote/reddit_loader.py new file mode 100644 index 00000000..f377717b --- /dev/null +++ b/application/parser/remote/reddit_loader.py @@ -0,0 +1,27 @@ +from application.parser.remote.base import BaseRemote +from langchain_community.document_loaders import RedditPostsLoader + + +class RedditPostsLoaderRemote(BaseRemote): + def load_data(self, inputs): + client_id = inputs.get("client_id") + client_secret = inputs.get("client_secret") + user_agent = inputs.get("user_agent") + categories = inputs.get("categories", ["new", "hot"]) + mode = inputs.get("mode", "subreddit") + search_queries = inputs.get("search_queries") + self.loader = RedditPostsLoader( + client_id=client_id, + client_secret=client_secret, + user_agent=user_agent, + categories=categories, + mode=mode, + search_queries=search_queries, + ) + documents = [] + try: + documents.extend(self.loader.load()) + except Exception as e: + print(f"Error processing Data: {e}") + print(f"Loaded {len(documents)} documents from Reddit") + return documents[:5] diff --git a/application/parser/remote/remote_creator.py b/application/parser/remote/remote_creator.py index e45333d4..d2a58f8d 100644 --- a/application/parser/remote/remote_creator.py +++ b/application/parser/remote/remote_creator.py @@ -1,13 +1,15 @@ from application.parser.remote.sitemap_loader import SitemapLoader from application.parser.remote.crawler_loader import CrawlerLoader from application.parser.remote.web_loader import WebLoader +from application.parser.remote.reddit_loader import RedditPostsLoaderRemote class RemoteCreator: loaders = { - 'url': WebLoader, - 'sitemap': SitemapLoader, - 'crawler': CrawlerLoader + "url": WebLoader, + "sitemap": SitemapLoader, + "crawler": CrawlerLoader, + "reddit": RedditPostsLoaderRemote, } @classmethod @@ -15,4 +17,4 @@ class RemoteCreator: loader_class = cls.loaders.get(type.lower()) if not loader_class: raise ValueError(f"No LLM class found for type {type}") - return loader_class(*args, **kwargs) \ No newline at end of file + return loader_class(*args, **kwargs) diff --git a/application/worker.py b/application/worker.py index 21bb319f..b783c335 100644 --- a/application/worker.py +++ b/application/worker.py @@ -15,23 +15,27 @@ from application.parser.schema.base import Document from application.parser.token_func import group_split try: - nltk.download('punkt', quiet=True) - nltk.download('averaged_perceptron_tagger', quiet=True) + nltk.download("punkt", quiet=True) + nltk.download("averaged_perceptron_tagger", quiet=True) except FileExistsError: pass # Define a function to extract metadata from a given filename. def metadata_from_filename(title): - store = '/'.join(title.split('/')[1:3]) - return {'title': title, 'store': store} + store = "/".join(title.split("/")[1:3]) + return {"title": title, "store": store} # Define a function to generate a random string of a given length. def generate_random_string(length): - return ''.join([string.ascii_letters[i % 52] for i in range(length)]) + return "".join([string.ascii_letters[i % 52] for i in range(length)]) + + +current_dir = os.path.dirname( + os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +) -current_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) # Define the main function for ingesting and processing documents. def ingest_worker(self, directory, formats, name_job, filename, user): @@ -62,38 +66,52 @@ def ingest_worker(self, directory, formats, name_job, filename, user): token_check = True min_tokens = 150 max_tokens = 1250 - full_path = directory + '/' + user + '/' + name_job + full_path = directory + "/" + user + "/" + name_job import sys + print(full_path, file=sys.stderr) # check if API_URL env variable is set - file_data = {'name': name_job, 'file': filename, 'user': user} - response = requests.get(urljoin(settings.API_URL, "/api/download"), params=file_data) + file_data = {"name": name_job, "file": filename, "user": user} + response = requests.get( + urljoin(settings.API_URL, "/api/download"), params=file_data + ) # check if file is in the response print(response, file=sys.stderr) file = response.content if not os.path.exists(full_path): os.makedirs(full_path) - with open(full_path + '/' + filename, 'wb') as f: + with open(full_path + "/" + filename, "wb") as f: f.write(file) # check if file is .zip and extract it - if filename.endswith('.zip'): - with zipfile.ZipFile(full_path + '/' + filename, 'r') as zip_ref: + if filename.endswith(".zip"): + with zipfile.ZipFile(full_path + "/" + filename, "r") as zip_ref: zip_ref.extractall(full_path) - os.remove(full_path + '/' + filename) + os.remove(full_path + "/" + filename) - self.update_state(state='PROGRESS', meta={'current': 1}) + self.update_state(state="PROGRESS", meta={"current": 1}) - raw_docs = SimpleDirectoryReader(input_dir=full_path, input_files=input_files, recursive=recursive, - required_exts=formats, num_files_limit=limit, - exclude_hidden=exclude, file_metadata=metadata_from_filename).load_data() - raw_docs = group_split(documents=raw_docs, min_tokens=min_tokens, max_tokens=max_tokens, token_check=token_check) + raw_docs = SimpleDirectoryReader( + input_dir=full_path, + input_files=input_files, + recursive=recursive, + required_exts=formats, + num_files_limit=limit, + exclude_hidden=exclude, + file_metadata=metadata_from_filename, + ).load_data() + raw_docs = group_split( + documents=raw_docs, + min_tokens=min_tokens, + max_tokens=max_tokens, + token_check=token_check, + ) docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs] call_openai_api(docs, full_path, self) - self.update_state(state='PROGRESS', meta={'current': 100}) + self.update_state(state="PROGRESS", meta={"current": 100}) if sample: for i in range(min(5, len(raw_docs))): @@ -101,70 +119,80 @@ def ingest_worker(self, directory, formats, name_job, filename, user): # get files from outputs/inputs/index.faiss and outputs/inputs/index.pkl # and send them to the server (provide user and name in form) - file_data = {'name': name_job, 'user': user} + file_data = {"name": name_job, "user": user} if settings.VECTOR_STORE == "faiss": - files = {'file_faiss': open(full_path + '/index.faiss', 'rb'), - 'file_pkl': open(full_path + '/index.pkl', 'rb')} - response = requests.post(urljoin(settings.API_URL, "/api/upload_index"), files=files, data=file_data) - response = requests.get(urljoin(settings.API_URL, "/api/delete_old?path=" + full_path)) + files = { + "file_faiss": open(full_path + "/index.faiss", "rb"), + "file_pkl": open(full_path + "/index.pkl", "rb"), + } + response = requests.post( + urljoin(settings.API_URL, "/api/upload_index"), files=files, data=file_data + ) + response = requests.get( + urljoin(settings.API_URL, "/api/delete_old?path=" + full_path) + ) else: - response = requests.post(urljoin(settings.API_URL, "/api/upload_index"), data=file_data) + response = requests.post( + urljoin(settings.API_URL, "/api/upload_index"), data=file_data + ) - # delete local shutil.rmtree(full_path) return { - 'directory': directory, - 'formats': formats, - 'name_job': name_job, - 'filename': filename, - 'user': user, - 'limited': False + "directory": directory, + "formats": formats, + "name_job": name_job, + "filename": filename, + "user": user, + "limited": False, } -def remote_worker(self, source_data, name_job, user, directory = 'temp', loader = 'url'): + +def remote_worker(self, source_data, name_job, user, directory="temp", loader="url"): # sample = False token_check = True min_tokens = 150 max_tokens = 1250 - full_path = directory + '/' + user + '/' + name_job + full_path = directory + "/" + user + "/" + name_job if not os.path.exists(full_path): os.makedirs(full_path) - self.update_state(state='PROGRESS', meta={'current': 1}) - + self.update_state(state="PROGRESS", meta={"current": 1}) + # source_data {"data": [url]} for url type task just urls - + # Use RemoteCreator to load data from URL remote_loader = RemoteCreator.create_loader(loader) raw_docs = remote_loader.load_data(source_data) - docs = group_split(documents=raw_docs, min_tokens=min_tokens, max_tokens=max_tokens, token_check=token_check) + docs = group_split( + documents=raw_docs, + min_tokens=min_tokens, + max_tokens=max_tokens, + token_check=token_check, + ) - #docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs] + # docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs] call_openai_api(docs, full_path, self) - self.update_state(state='PROGRESS', meta={'current': 100}) - - + self.update_state(state="PROGRESS", meta={"current": 100}) # Proceed with uploading and cleaning as in the original function - file_data = {'name': name_job, 'user': user} + file_data = {"name": name_job, "user": user} if settings.VECTOR_STORE == "faiss": - files = {'file_faiss': open(full_path + '/index.faiss', 'rb'), - 'file_pkl': open(full_path + '/index.pkl', 'rb')} - requests.post(urljoin(settings.API_URL, "/api/upload_index"), files=files, data=file_data) + files = { + "file_faiss": open(full_path + "/index.faiss", "rb"), + "file_pkl": open(full_path + "/index.pkl", "rb"), + } + requests.post( + urljoin(settings.API_URL, "/api/upload_index"), files=files, data=file_data + ) requests.get(urljoin(settings.API_URL, "/api/delete_old?path=" + full_path)) else: requests.post(urljoin(settings.API_URL, "/api/upload_index"), data=file_data) shutil.rmtree(full_path) - return { - 'urls': source_data, - 'name_job': name_job, - 'user': user, - 'limited': False - } \ No newline at end of file + return {"urls": source_data, "name_job": name_job, "user": user, "limited": False} diff --git a/frontend/src/components/Dropdown.tsx b/frontend/src/components/Dropdown.tsx index 5654b430..7a4936b3 100644 --- a/frontend/src/components/Dropdown.tsx +++ b/frontend/src/components/Dropdown.tsx @@ -35,10 +35,10 @@ function Dropdown({ isOpen ? typeof selectedValue === 'string' ? 'rounded-t-xl' - : 'rounded-t-2xl' + : 'rounded-t-3xl' : typeof selectedValue === 'string' ? 'rounded-xl' - : 'rounded-full' + : 'rounded-3xl' }`} > {typeof selectedValue === 'string' ? ( diff --git a/frontend/src/upload/Upload.tsx b/frontend/src/upload/Upload.tsx index dae5656b..6870ee26 100644 --- a/frontend/src/upload/Upload.tsx +++ b/frontend/src/upload/Upload.tsx @@ -21,6 +21,7 @@ export default function Upload({ { label: 'Crawler', value: 'crawler' }, // { label: 'Sitemap', value: 'sitemap' }, { label: 'Link', value: 'url' }, + { label: 'Reddit', value: 'reddit' }, ]; const [urlType, setUrlType] = useState<{ label: string; value: string }>({ label: 'Link', From 577556678caf35ffa5f38a68712495476017d89b Mon Sep 17 00:00:00 2001 From: Anton Larin Date: Thu, 21 Mar 2024 10:14:48 +0100 Subject: [PATCH 2/7] Fix model selection at least for openAI LLM_NAME --- application/api/answer/routes.py | 11 +++++++---- application/core/settings.py | 1 + 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/application/api/answer/routes.py b/application/api/answer/routes.py index 4c393714..4c86cf4b 100644 --- a/application/api/answer/routes.py +++ b/application/api/answer/routes.py @@ -28,12 +28,15 @@ vectors_collection = db["vectors"] prompts_collection = db["prompts"] answer = Blueprint('answer', __name__) -if settings.LLM_NAME == "gpt4": - gpt_model = 'gpt-4' +gpt_model = "" +# to have some kind of default behaviour +if settings.LLM_NAME == "openai": + gpt_model = 'gpt-3.5-turbo' elif settings.LLM_NAME == "anthropic": gpt_model = 'claude-2' -else: - gpt_model = 'gpt-3.5-turbo' + +if settings.MODEL_NAME: # in case there is particular model name configured + gpt_model = settings.MODEL_NAME # load the prompts current_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) diff --git a/application/core/settings.py b/application/core/settings.py index 84073b7d..0e1909e6 100644 --- a/application/core/settings.py +++ b/application/core/settings.py @@ -9,6 +9,7 @@ current_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__ class Settings(BaseSettings): LLM_NAME: str = "docsgpt" + MODEL_NAME: Optional[str] = None # when LLM_NAME is openai, MODEL_NAME can be e.g. gpt-4-turbo-preview or gpt-3.5-turbo EMBEDDINGS_NAME: str = "huggingface_sentence-transformers/all-mpnet-base-v2" CELERY_BROKER_URL: str = "redis://localhost:6379/0" CELERY_RESULT_BACKEND: str = "redis://localhost:6379/1" From ed081235503d881f67ac3edc8ab9aea953e39aca Mon Sep 17 00:00:00 2001 From: Alex Date: Fri, 22 Mar 2024 14:50:56 +0000 Subject: [PATCH 3/7] Add support for setting the number of chunks processed per query --- application/api/answer/routes.py | 35 +++++++++++++++---- frontend/src/Setting.tsx | 17 ++++++++- .../src/conversation/ConversationBubble.tsx | 5 ++- frontend/src/conversation/conversationApi.ts | 6 ++++ .../src/conversation/conversationSlice.ts | 3 ++ frontend/src/preferences/preferenceSlice.ts | 17 +++++++++ frontend/src/store.ts | 2 ++ 7 files changed, 76 insertions(+), 9 deletions(-) diff --git a/application/api/answer/routes.py b/application/api/answer/routes.py index 4c86cf4b..94855026 100644 --- a/application/api/answer/routes.py +++ b/application/api/answer/routes.py @@ -98,7 +98,7 @@ def is_azure_configured(): return settings.OPENAI_API_BASE and settings.OPENAI_API_VERSION and settings.AZURE_DEPLOYMENT_NAME -def complete_stream(question, docsearch, chat_history, api_key, prompt_id, conversation_id): +def complete_stream(question, docsearch, chat_history, api_key, prompt_id, conversation_id, chunks=2): llm = LLMCreator.create_llm(settings.LLM_NAME, api_key=api_key) if prompt_id == 'default': @@ -109,8 +109,11 @@ def complete_stream(question, docsearch, chat_history, api_key, prompt_id, conve prompt = chat_combine_strict else: prompt = prompts_collection.find_one({"_id": ObjectId(prompt_id)})["content"] - - docs = docsearch.search(question, k=2) + + if chunks == 0: + docs = [] + else: + docs = docsearch.search(question, k=chunks) if settings.LLM_NAME == "llama.cpp": docs = [docs[0]] # join all page_content together with a newline @@ -193,6 +196,10 @@ def stream(): prompt_id = data["prompt_id"] else: prompt_id = 'default' + if 'chunks' in data: + chunks = int(data["chunks"]) + else: + chunks = 2 # check if active_docs is set @@ -214,7 +221,8 @@ def stream(): complete_stream(question, docsearch, chat_history=history, api_key=api_key, prompt_id=prompt_id, - conversation_id=conversation_id), mimetype="text/event-stream" + conversation_id=conversation_id, + chunks=chunks), mimetype="text/event-stream" ) @@ -240,6 +248,10 @@ def api_answer(): prompt_id = data["prompt_id"] else: prompt_id = 'default' + if 'chunks' in data: + chunks = int(data["chunks"]) + else: + chunks = 2 if prompt_id == 'default': prompt = chat_combine_template @@ -263,7 +275,10 @@ def api_answer(): - docs = docsearch.search(question, k=2) + if chunks == 0: + docs = [] + else: + docs = docsearch.search(question, k=chunks) # join all page_content together with a newline docs_together = "\n".join([doc.page_content for doc in docs]) p_chat_combine = prompt.replace("{summaries}", docs_together) @@ -362,9 +377,15 @@ def api_search(): vectorstore = get_vectorstore({"active_docs": data["active_docs"]}) else: vectorstore = "" + if 'chunks' in data: + chunks = int(data["chunks"]) + else: + chunks = 2 docsearch = VectorCreator.create_vectorstore(settings.VECTOR_STORE, vectorstore, embeddings_key) - - docs = docsearch.search(question, k=2) + if chunks == 0: + docs = [] + else: + docs = docsearch.search(question, k=chunks) source_log_docs = [] for doc in docs: diff --git a/frontend/src/Setting.tsx b/frontend/src/Setting.tsx index 56423797..46999fd1 100644 --- a/frontend/src/Setting.tsx +++ b/frontend/src/Setting.tsx @@ -8,6 +8,8 @@ import { setPrompt, selectSourceDocs, setSourceDocs, + setChunks, + selectChunks, } from './preferences/preferenceSlice'; import { Doc } from './preferences/preferenceApi'; import { useDarkTheme } from './hooks'; @@ -193,10 +195,13 @@ const Setting: React.FC = () => { const General: React.FC = () => { const themes = ['Light', 'Dark']; const languages = ['English']; + const chunks = ['0', '2', '4', '6', '8', '10']; + const selectedChunks = useSelector(selectChunks); const [isDarkTheme, toggleTheme] = useDarkTheme(); const [selectedTheme, setSelectedTheme] = useState( isDarkTheme ? 'Dark' : 'Light', ); + const dispatch = useDispatch(); const [selectedLanguage, setSelectedLanguage] = useState(languages[0]); return (
@@ -211,7 +216,7 @@ const General: React.FC = () => { }} />
-
+

Select Language

@@ -221,6 +226,16 @@ const General: React.FC = () => { onSelect={setSelectedLanguage} />
+
+

+ Chunks processed per query +

+ dispatch(setChunks(value))} + /> +
); }; diff --git a/frontend/src/conversation/ConversationBubble.tsx b/frontend/src/conversation/ConversationBubble.tsx index b95b413b..e8caf2f9 100644 --- a/frontend/src/conversation/ConversationBubble.tsx +++ b/frontend/src/conversation/ConversationBubble.tsx @@ -160,7 +160,10 @@ const ConversationBubble = forwardRef< > {message} - {DisableSourceFE || type === 'ERROR' ? null : ( + {DisableSourceFE || + type === 'ERROR' || + !sources || + sources.length === 0 ? null : ( <>
diff --git a/frontend/src/conversation/conversationApi.ts b/frontend/src/conversation/conversationApi.ts index 8293df1b..d8d76937 100644 --- a/frontend/src/conversation/conversationApi.ts +++ b/frontend/src/conversation/conversationApi.ts @@ -11,6 +11,7 @@ export function fetchAnswerApi( history: Array = [], conversationId: string | null, promptId: string | null, + chunks: string, ): Promise< | { result: any; @@ -65,6 +66,7 @@ export function fetchAnswerApi( active_docs: docPath, conversation_id: conversationId, prompt_id: promptId, + chunks: chunks, }), signal, }) @@ -95,6 +97,7 @@ export function fetchAnswerSteaming( history: Array = [], conversationId: string | null, promptId: string | null, + chunks: string, onEvent: (event: MessageEvent) => void, ): Promise { let namePath = selectedDocs.name; @@ -130,6 +133,7 @@ export function fetchAnswerSteaming( history: JSON.stringify(history), conversation_id: conversationId, prompt_id: promptId, + chunks: chunks, }; fetch(apiHost + '/stream', { method: 'POST', @@ -192,6 +196,7 @@ export function searchEndpoint( selectedDocs: Doc, conversation_id: string | null, history: Array = [], + chunks: string, ) { /* "active_docs": "default", @@ -223,6 +228,7 @@ export function searchEndpoint( active_docs: docPath, conversation_id, history, + chunks: chunks, }; return fetch(`${apiHost}/api/search`, { method: 'POST', diff --git a/frontend/src/conversation/conversationSlice.ts b/frontend/src/conversation/conversationSlice.ts index 35aadd9a..85fc3510 100644 --- a/frontend/src/conversation/conversationSlice.ts +++ b/frontend/src/conversation/conversationSlice.ts @@ -28,6 +28,7 @@ export const fetchAnswer = createAsyncThunk( state.conversation.queries, state.conversation.conversationId, state.preference.prompt.id, + state.preference.chunks, (event) => { const data = JSON.parse(event.data); @@ -51,6 +52,7 @@ export const fetchAnswer = createAsyncThunk( state.preference.selectedDocs!, state.conversation.conversationId, state.conversation.queries, + state.preference.chunks, ).then((sources) => { //dispatch streaming sources dispatch( @@ -86,6 +88,7 @@ export const fetchAnswer = createAsyncThunk( state.conversation.queries, state.conversation.conversationId, state.preference.prompt.id, + state.preference.chunks, ); if (answer) { let sourcesPrepped = []; diff --git a/frontend/src/preferences/preferenceSlice.ts b/frontend/src/preferences/preferenceSlice.ts index 0aa8b3b5..dc72fae1 100644 --- a/frontend/src/preferences/preferenceSlice.ts +++ b/frontend/src/preferences/preferenceSlice.ts @@ -10,6 +10,7 @@ interface Preference { apiKey: string; prompt: { name: string; id: string; type: string }; selectedDocs: Doc | null; + chunks: string; sourceDocs: Doc[] | null; conversations: { name: string; id: string }[] | null; } @@ -17,6 +18,7 @@ interface Preference { const initialState: Preference = { apiKey: 'xxx', prompt: { name: 'default', id: 'default', type: 'public' }, + chunks: '2', selectedDocs: { name: 'default', language: 'default', @@ -51,6 +53,9 @@ export const prefSlice = createSlice({ setPrompt: (state, action) => { state.prompt = action.payload; }, + setChunks: (state, action) => { + state.chunks = action.payload; + }, }, }); @@ -60,6 +65,7 @@ export const { setSourceDocs, setConversations, setPrompt, + setChunks, } = prefSlice.actions; export default prefSlice.reducer; @@ -91,6 +97,16 @@ prefListenerMiddleware.startListening({ }, }); +prefListenerMiddleware.startListening({ + matcher: isAnyOf(setChunks), + effect: (action, listenerApi) => { + localStorage.setItem( + 'DocsGPTChunks', + JSON.stringify((listenerApi.getState() as RootState).preference.chunks), + ); + }, +}); + export const selectApiKey = (state: RootState) => state.preference.apiKey; export const selectApiKeyStatus = (state: RootState) => !!state.preference.apiKey; @@ -105,3 +121,4 @@ export const selectConversations = (state: RootState) => export const selectConversationId = (state: RootState) => state.conversation.conversationId; export const selectPrompt = (state: RootState) => state.preference.prompt; +export const selectChunks = (state: RootState) => state.preference.chunks; diff --git a/frontend/src/store.ts b/frontend/src/store.ts index 234cc8e9..232675a7 100644 --- a/frontend/src/store.ts +++ b/frontend/src/store.ts @@ -8,11 +8,13 @@ import { const key = localStorage.getItem('DocsGPTApiKey'); const prompt = localStorage.getItem('DocsGPTPrompt'); const doc = localStorage.getItem('DocsGPTRecentDocs'); +const chunks = localStorage.getItem('DocsGPTChunks'); const store = configureStore({ preloadedState: { preference: { apiKey: key ?? '', + chunks: JSON.parse(chunks ?? '2'), selectedDocs: doc !== null ? JSON.parse(doc) : null, prompt: prompt !== null From 3c492062a97947c93d6639e99e751bcffa28bf59 Mon Sep 17 00:00:00 2001 From: Alex Date: Sat, 23 Mar 2024 11:42:50 +0000 Subject: [PATCH 4/7] Fix parsing issue with chunks in store.ts --- frontend/src/store.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frontend/src/store.ts b/frontend/src/store.ts index 232675a7..c217648e 100644 --- a/frontend/src/store.ts +++ b/frontend/src/store.ts @@ -14,7 +14,7 @@ const store = configureStore({ preloadedState: { preference: { apiKey: key ?? '', - chunks: JSON.parse(chunks ?? '2'), + chunks: JSON.parse(chunks ?? '2').toString(), selectedDocs: doc !== null ? JSON.parse(doc) : null, prompt: prompt !== null From 31d947837fa56b1bb360fa1085b914f7d8ace4ba Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 25 Mar 2024 20:31:43 +0000 Subject: [PATCH 5/7] Bump katex from 0.16.9 to 0.16.10 in /docs Bumps [katex](https://github.com/KaTeX/KaTeX) from 0.16.9 to 0.16.10. - [Release notes](https://github.com/KaTeX/KaTeX/releases) - [Changelog](https://github.com/KaTeX/KaTeX/blob/main/CHANGELOG.md) - [Commits](https://github.com/KaTeX/KaTeX/compare/v0.16.9...v0.16.10) --- updated-dependencies: - dependency-name: katex dependency-type: indirect ... Signed-off-by: dependabot[bot] --- docs/package-lock.json | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/docs/package-lock.json b/docs/package-lock.json index dc608479..e2c0f301 100644 --- a/docs/package-lock.json +++ b/docs/package-lock.json @@ -7,7 +7,7 @@ "license": "MIT", "dependencies": { "@vercel/analytics": "^1.1.1", - "docsgpt": "^0.3.0", + "docsgpt": "^0.3.7", "next": "^14.0.4", "nextra": "^2.13.2", "nextra-theme-docs": "^2.13.2", @@ -422,6 +422,11 @@ "node": ">=6.9.0" } }, + "node_modules/@bpmn-io/snarkdown": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/@bpmn-io/snarkdown/-/snarkdown-2.2.0.tgz", + "integrity": "sha512-bVD7FIoaBDZeCJkMRgnBPDeptPlto87wt2qaCjf5t8iLaevDmTPaREd6FpBEGsHlUdHFFZWRk4qAoEC5So2M0Q==" + }, "node_modules/@braintree/sanitize-url": { "version": "6.0.4", "resolved": "https://registry.npmjs.org/@braintree/sanitize-url/-/sanitize-url-6.0.4.tgz", @@ -4958,11 +4963,12 @@ } }, "node_modules/docsgpt": { - "version": "0.3.0", - "resolved": "https://registry.npmjs.org/docsgpt/-/docsgpt-0.3.0.tgz", - "integrity": "sha512-0yT2m+HAlJ+289p278c3Zi07bu2wr6zULOT/bYXtJ/nb59V2Vpfdj2xFB49+lYLSeVe8H+Ij5fFSNZ6RkVRfMQ==", + "version": "0.3.7", + "resolved": "https://registry.npmjs.org/docsgpt/-/docsgpt-0.3.7.tgz", + "integrity": "sha512-VHrXXOEFtjNTcpA8Blf3IzpLlJxOMhm/S5CM4FDjQEkdK9WWhI8yXd/0Rs/FS8oz7YbFrNxO758mlP7OtQtBBw==", "dependencies": { "@babel/plugin-transform-flow-strip-types": "^7.23.3", + "@bpmn-io/snarkdown": "^2.2.0", "@parcel/resolver-glob": "^2.12.0", "@parcel/transformer-svg-react": "^2.12.0", "@parcel/transformer-typescript-tsc": "^2.12.0", @@ -4972,6 +4978,7 @@ "@types/react-dom": "^18.2.19", "class-variance-authority": "^0.7.0", "clsx": "^2.1.0", + "dompurify": "^3.0.9", "flow-bin": "^0.229.2", "i": "^0.3.7", "install": "^0.13.0", @@ -5029,9 +5036,9 @@ } }, "node_modules/dompurify": { - "version": "3.0.7", - "resolved": "https://registry.npmjs.org/dompurify/-/dompurify-3.0.7.tgz", - "integrity": "sha512-BViYTZoqP3ak/ULKOc101y+CtHDUvBsVgSxIF1ku0HmK6BRf+C03MC+tArMvOPtVtZp83DDh5puywKDu4sbVjQ==" + "version": "3.0.11", + "resolved": "https://registry.npmjs.org/dompurify/-/dompurify-3.0.11.tgz", + "integrity": "sha512-Fan4uMuyB26gFV3ovPoEoQbxRRPfTu3CvImyZnhGq5fsIEO+gEFLp45ISFt+kQBWsK5ulDdT0oV28jS1UrwQLg==" }, "node_modules/domutils": { "version": "2.8.0", @@ -6206,9 +6213,9 @@ "integrity": "sha512-gfFQZrcTc8CnKXp6Y4/CBT3fTc0OVuDofpre4aEeEpSBPV5X5v4+Vmx+8snU7RLPrNHPKSgLxGo9YuQzz20o+w==" }, "node_modules/katex": { - "version": "0.16.9", - "resolved": "https://registry.npmjs.org/katex/-/katex-0.16.9.tgz", - "integrity": "sha512-fsSYjWS0EEOwvy81j3vRA8TEAhQhKiqO+FQaKWp0m39qwOzHVBgAUBIXWj1pB+O2W3fIpNa6Y9KSKCVbfPhyAQ==", + "version": "0.16.10", + "resolved": "https://registry.npmjs.org/katex/-/katex-0.16.10.tgz", + "integrity": "sha512-ZiqaC04tp2O5utMsl2TEZTXxa6WSC4yo0fv5ML++D3QZv/vx2Mct0mTlRx3O+uUkjfuAgOkzsCmq5MiUEsDDdA==", "funding": [ "https://opencollective.com/katex", "https://github.com/sponsors/katex" From eed1bfbe50e191dbdf0e5d7aca15618cd77e7612 Mon Sep 17 00:00:00 2001 From: Siddhant Rai Date: Tue, 26 Mar 2024 16:07:44 +0530 Subject: [PATCH 6/7] feat: fields to handle reddit loader + minor changes --- application/parser/remote/reddit_loader.py | 22 ++-- application/worker.py | 2 +- frontend/src/upload/Upload.tsx | 134 +++++++++++++++++---- 3 files changed, 120 insertions(+), 38 deletions(-) diff --git a/application/parser/remote/reddit_loader.py b/application/parser/remote/reddit_loader.py index f377717b..3c9f93ea 100644 --- a/application/parser/remote/reddit_loader.py +++ b/application/parser/remote/reddit_loader.py @@ -4,12 +4,13 @@ from langchain_community.document_loaders import RedditPostsLoader class RedditPostsLoaderRemote(BaseRemote): def load_data(self, inputs): - client_id = inputs.get("client_id") - client_secret = inputs.get("client_secret") - user_agent = inputs.get("user_agent") - categories = inputs.get("categories", ["new", "hot"]) - mode = inputs.get("mode", "subreddit") - search_queries = inputs.get("search_queries") + data = eval(inputs) + client_id = data.get("client_id") + client_secret = data.get("client_secret") + user_agent = data.get("user_agent") + categories = data.get("categories", ["new", "hot"]) + mode = data.get("mode", "subreddit") + search_queries = data.get("search_queries") self.loader = RedditPostsLoader( client_id=client_id, client_secret=client_secret, @@ -17,11 +18,8 @@ class RedditPostsLoaderRemote(BaseRemote): categories=categories, mode=mode, search_queries=search_queries, + number_posts=10, ) - documents = [] - try: - documents.extend(self.loader.load()) - except Exception as e: - print(f"Error processing Data: {e}") + documents = self.loader.load() print(f"Loaded {len(documents)} documents from Reddit") - return documents[:5] + return documents diff --git a/application/worker.py b/application/worker.py index b783c335..3891fde9 100644 --- a/application/worker.py +++ b/application/worker.py @@ -149,7 +149,7 @@ def ingest_worker(self, directory, formats, name_job, filename, user): } -def remote_worker(self, source_data, name_job, user, directory="temp", loader="url"): +def remote_worker(self, source_data, name_job, user, loader, directory="temp"): # sample = False token_check = True min_tokens = 150 diff --git a/frontend/src/upload/Upload.tsx b/frontend/src/upload/Upload.tsx index 6870ee26..1614375d 100644 --- a/frontend/src/upload/Upload.tsx +++ b/frontend/src/upload/Upload.tsx @@ -17,6 +17,12 @@ export default function Upload({ const [docName, setDocName] = useState(''); const [urlName, setUrlName] = useState(''); const [url, setUrl] = useState(''); + const [redditData, setRedditData] = useState({ + client_id: '', + client_secret: '', + user_agent: '', + search_queries: [''], + }); const urlOptions: { label: string; value: string }[] = [ { label: 'Crawler', value: 'crawler' }, // { label: 'Sitemap', value: 'sitemap' }, @@ -164,7 +170,6 @@ export default function Upload({ }; const uploadRemote = () => { - console.log('here'); const formData = new FormData(); formData.append('name', urlName); formData.append('user', 'local'); @@ -172,6 +177,13 @@ export default function Upload({ formData.append('source', urlType?.value); } formData.append('data', url); + if ( + redditData.client_id.length > 0 && + redditData.client_secret.length > 0 + ) { + formData.set('name', 'other'); + formData.set('data', JSON.stringify(redditData)); + } const apiHost = import.meta.env.VITE_API_HOST; const xhr = new XMLHttpRequest(); xhr.upload.addEventListener('progress', (event) => { @@ -203,6 +215,19 @@ export default function Upload({ ['.docx'], }, }); + const handleChange = (e: React.ChangeEvent) => { + const { name, value } = e.target; + if (name === 'search_queries' && value.length > 0) { + setRedditData({ + ...redditData, + [name]: value.split(',').map((item) => item.trim()), + }); + } else + setRedditData({ + ...redditData, + [name]: value, + }); + }; let view; if (progress?.type === 'UPLOAD') { view = ; @@ -282,30 +307,89 @@ export default function Upload({ setUrlType(value) } /> - setUrlName(e.target.value)} - > -
- - Name - -
- setUrl(e.target.value)} - > -
- - Link - -
+ {urlType.label !== 'Reddit' ? ( + <> + setUrlName(e.target.value)} + > +
+ + Name + +
+ setUrl(e.target.value)} + > +
+ + Link + +
+ + ) : ( + <> + +
+ + Client ID + +
+ +
+ + Client secret + +
+ +
+ + User agent + +
+ +
+ + Search queries + +
+ + )} )}
From e01071426f1fc989f463943cfbbb16bf32086be9 Mon Sep 17 00:00:00 2001 From: Siddhant Rai Date: Wed, 27 Mar 2024 19:20:55 +0530 Subject: [PATCH 7/7] feat: field to pass number of posts as a parameter --- application/parser/remote/reddit_loader.py | 3 ++- frontend/src/upload/Upload.tsx | 14 ++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/application/parser/remote/reddit_loader.py b/application/parser/remote/reddit_loader.py index 3c9f93ea..0230653a 100644 --- a/application/parser/remote/reddit_loader.py +++ b/application/parser/remote/reddit_loader.py @@ -11,6 +11,7 @@ class RedditPostsLoaderRemote(BaseRemote): categories = data.get("categories", ["new", "hot"]) mode = data.get("mode", "subreddit") search_queries = data.get("search_queries") + number_posts = data.get("number_posts", 10) self.loader = RedditPostsLoader( client_id=client_id, client_secret=client_secret, @@ -18,7 +19,7 @@ class RedditPostsLoaderRemote(BaseRemote): categories=categories, mode=mode, search_queries=search_queries, - number_posts=10, + number_posts=number_posts, ) documents = self.loader.load() print(f"Loaded {len(documents)} documents from Reddit") diff --git a/frontend/src/upload/Upload.tsx b/frontend/src/upload/Upload.tsx index 1614375d..45fc4e1a 100644 --- a/frontend/src/upload/Upload.tsx +++ b/frontend/src/upload/Upload.tsx @@ -22,6 +22,7 @@ export default function Upload({ client_secret: '', user_agent: '', search_queries: [''], + number_posts: 10, }); const urlOptions: { label: string; value: string }[] = [ { label: 'Crawler', value: 'crawler' }, @@ -388,6 +389,19 @@ export default function Upload({ Search queries
+ +
+ + Number of posts + +
)}