From 38753c4395484932362749fe4c14533bb3cc5629 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 16 Dec 2024 20:09:34 +0000 Subject: [PATCH 01/26] build(deps): bump yarl from 1.11.1 to 1.18.3 in /application Bumps [yarl](https://github.com/aio-libs/yarl) from 1.11.1 to 1.18.3. - [Release notes](https://github.com/aio-libs/yarl/releases) - [Changelog](https://github.com/aio-libs/yarl/blob/master/CHANGES.rst) - [Commits](https://github.com/aio-libs/yarl/compare/v1.11.1...v1.18.3) --- updated-dependencies: - dependency-name: yarl dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- application/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/application/requirements.txt b/application/requirements.txt index 232b8508..13e9a30a 100644 --- a/application/requirements.txt +++ b/application/requirements.txt @@ -86,4 +86,4 @@ urllib3==2.2.3 vine==5.1.0 wcwidth==0.2.13 werkzeug==3.1.3 -yarl==1.11.1 \ No newline at end of file +yarl==1.18.3 \ No newline at end of file From c4f3dc4434402b7223e01feb8a8f2dde63018ac4 Mon Sep 17 00:00:00 2001 From: Pavel Date: Fri, 20 Dec 2024 18:41:47 +0300 Subject: [PATCH 02/26] test version --- application/parser/chunking.py | 118 +++++++++++++++++++++++ application/parser/embedding_pipeline.py | 86 +++++++++++++++++ application/parser/open_ai_func.py | 75 -------------- application/parser/token_func.py | 79 --------------- application/worker.py | 30 +++--- 5 files changed, 221 insertions(+), 167 deletions(-) create mode 100644 application/parser/chunking.py create mode 100755 application/parser/embedding_pipeline.py delete mode 100755 application/parser/open_ai_func.py delete mode 100644 application/parser/token_func.py diff --git a/application/parser/chunking.py b/application/parser/chunking.py new file mode 100644 index 00000000..26f05dba --- /dev/null +++ b/application/parser/chunking.py @@ -0,0 +1,118 @@ +import re +from typing import List, Tuple, Union +import logging +from application.parser.schema.base import Document +from application.utils import get_encoding + +logger = logging.getLogger(__name__) + +class Chunker: + def __init__( + self, + chunking_strategy: str = "classic_chunk", + max_tokens: int = 2000, + min_tokens: int = 150, + duplicate_headers: bool = False, + ): + if chunking_strategy not in ["classic_chunk"]: + raise ValueError(f"Unsupported chunking strategy: {chunking_strategy}") + self.chunking_strategy = chunking_strategy + self.max_tokens = max_tokens + self.min_tokens = min_tokens + self.duplicate_headers = duplicate_headers + self.encoding = get_encoding() + + def separate_header_and_body(self, text: str) -> Tuple[str, str]: + header_pattern = r"^(.*?\n){3}" + match = re.match(header_pattern, text) + if match: + header = match.group(0) + body = text[len(header):] + else: + header, body = "", text # No header, treat entire text as body + return header, body + + def combine_documents(self, doc: Document, next_doc: Document) -> Document: + combined_text = doc.text + " " + next_doc.text + combined_token_count = len(self.encoding.encode(combined_text)) + new_doc = Document( + text=combined_text, + doc_id=doc.doc_id, + embedding=doc.embedding, + extra_info={**(doc.extra_info or {}), "token_count": combined_token_count} + ) + return new_doc + + def split_document(self, doc: Document) -> List[Document]: + split_docs = [] + header, body = self.separate_header_and_body(doc.text) + header_tokens = self.encoding.encode(header) if header else [] + body_tokens = self.encoding.encode(body) + + current_position = 0 + part_index = 0 + while current_position < len(body_tokens): + end_position = current_position + self.max_tokens - len(header_tokens) + chunk_tokens = (header_tokens + body_tokens[current_position:end_position] + if self.duplicate_headers or part_index == 0 else body_tokens[current_position:end_position]) + chunk_text = self.encoding.decode(chunk_tokens) + new_doc = Document( + text=chunk_text, + doc_id=f"{doc.doc_id}-{part_index}", + embedding=doc.embedding, + extra_info={**(doc.extra_info or {}), "token_count": len(chunk_tokens)} + ) + split_docs.append(new_doc) + current_position = end_position + part_index += 1 + header_tokens = [] + return split_docs + + def classic_chunk(self, documents: List[Document]) -> List[Document]: + processed_docs = [] + i = 0 + while i < len(documents): + doc = documents[i] + tokens = self.encoding.encode(doc.text) + token_count = len(tokens) + + if self.min_tokens <= token_count <= self.max_tokens: + doc.extra_info = doc.extra_info or {} + doc.extra_info["token_count"] = token_count + processed_docs.append(doc) + i += 1 + elif token_count < self.min_tokens: + if i + 1 < len(documents): + next_doc = documents[i + 1] + next_tokens = self.encoding.encode(next_doc.text) + if token_count + len(next_tokens) <= self.max_tokens: + # Combine small documents + combined_doc = self.combine_documents(doc, next_doc) + processed_docs.append(combined_doc) + i += 2 + else: + # Keep the small document as is if adding next_doc would exceed max_tokens + doc.extra_info = doc.extra_info or {} + doc.extra_info["token_count"] = token_count + processed_docs.append(doc) + i += 1 + else: + # No next document to combine with; add the small document as is + doc.extra_info = doc.extra_info or {} + doc.extra_info["token_count"] = token_count + processed_docs.append(doc) + i += 1 + else: + # Split large documents + processed_docs.extend(self.split_document(doc)) + i += 1 + return processed_docs + + def chunk( + self, + documents: List[Document] + ) -> List[Document]: + if self.chunking_strategy == "classic_chunk": + return self.classic_chunk(documents) + else: + raise ValueError("Unsupported chunking strategy") diff --git a/application/parser/embedding_pipeline.py b/application/parser/embedding_pipeline.py new file mode 100755 index 00000000..6cf40048 --- /dev/null +++ b/application/parser/embedding_pipeline.py @@ -0,0 +1,86 @@ +import os +import logging +from retry import retry +from tqdm import tqdm +from application.core.settings import settings +from application.vectorstore.vector_creator import VectorCreator + + +@retry(tries=10, delay=60) +def add_text_to_store_with_retry(store, doc, source_id): + """ + Add a document's text and metadata to the vector store with retry logic. + Args: + store: The vector store object. + doc: The document to be added. + source_id: Unique identifier for the source. + """ + try: + doc.metadata["source_id"] = str(source_id) + store.add_texts([doc.page_content], metadatas=[doc.metadata]) + except Exception as e: + logging.error(f"Failed to add document with retry: {e}") + raise + + +def embed_and_store_documents(docs, folder_name, source_id, task_status): + """ + Embeds documents and stores them in a vector store. + + Args: + docs (list): List of documents to be embedded and stored. + folder_name (str): Directory to save the vector store. + source_id (str): Unique identifier for the source. + task_status: Task state manager for progress updates. + + Returns: + None + """ + # Ensure the folder exists + if not os.path.exists(folder_name): + os.makedirs(folder_name) + + # Initialize vector store + if settings.VECTOR_STORE == "faiss": + docs_init = [docs.pop(0)] + store = VectorCreator.create_vectorstore( + settings.VECTOR_STORE, + docs_init=docs_init, + source_id=folder_name, + embeddings_key=os.getenv("EMBEDDINGS_KEY"), + ) + else: + store = VectorCreator.create_vectorstore( + settings.VECTOR_STORE, + source_id=source_id, + embeddings_key=os.getenv("EMBEDDINGS_KEY"), + ) + store.delete_index() + + total_docs = len(docs) + + # Process and embed documents + for idx, doc in tqdm( + docs, + desc="Embedding 🦖", + unit="docs", + total=total_docs, + bar_format="{l_bar}{bar}| Time Left: {remaining}", + ): + try: + # Update task status for progress tracking + progress = int((idx / total_docs) * 100) + task_status.update_state(state="PROGRESS", meta={"current": progress}) + + # Add document to vector store + add_text_to_store_with_retry(store, doc, source_id) + except Exception as e: + logging.error(f"Error embedding document {idx}: {e}") + logging.info(f"Saving progress at document {idx} out of {total_docs}") + store.save_local(folder_name) + break + + # Save the vector store + if settings.VECTOR_STORE == "faiss": + store.save_local(folder_name) + logging.info("Vector store saved successfully.") diff --git a/application/parser/open_ai_func.py b/application/parser/open_ai_func.py deleted file mode 100755 index 3109f583..00000000 --- a/application/parser/open_ai_func.py +++ /dev/null @@ -1,75 +0,0 @@ -import os - -from retry import retry - -from application.core.settings import settings - -from application.vectorstore.vector_creator import VectorCreator - - -# from langchain_community.embeddings import HuggingFaceEmbeddings -# from langchain_community.embeddings import HuggingFaceInstructEmbeddings -# from langchain_community.embeddings import CohereEmbeddings - - -@retry(tries=10, delay=60) -def store_add_texts_with_retry(store, i, id): - # add source_id to the metadata - i.metadata["source_id"] = str(id) - store.add_texts([i.page_content], metadatas=[i.metadata]) - # store_pine.add_texts([i.page_content], metadatas=[i.metadata]) - - -def call_openai_api(docs, folder_name, id, task_status): - # Function to create a vector store from the documents and save it to disk - - if not os.path.exists(f"{folder_name}"): - os.makedirs(f"{folder_name}") - - from tqdm import tqdm - - c1 = 0 - if settings.VECTOR_STORE == "faiss": - docs_init = [docs[0]] - docs.pop(0) - - store = VectorCreator.create_vectorstore( - settings.VECTOR_STORE, - docs_init=docs_init, - source_id=f"{folder_name}", - embeddings_key=os.getenv("EMBEDDINGS_KEY"), - ) - else: - store = VectorCreator.create_vectorstore( - settings.VECTOR_STORE, - source_id=str(id), - embeddings_key=os.getenv("EMBEDDINGS_KEY"), - ) - store.delete_index() - # Uncomment for MPNet embeddings - # model_name = "sentence-transformers/all-mpnet-base-v2" - # hf = HuggingFaceEmbeddings(model_name=model_name) - # store = FAISS.from_documents(docs_test, hf) - s1 = len(docs) - for i in tqdm( - docs, - desc="Embedding 🦖", - unit="docs", - total=len(docs), - bar_format="{l_bar}{bar}| Time Left: {remaining}", - ): - try: - task_status.update_state( - state="PROGRESS", meta={"current": int((c1 / s1) * 100)} - ) - store_add_texts_with_retry(store, i, id) - except Exception as e: - print(e) - print("Error on ", i) - print("Saving progress") - print(f"stopped at {c1} out of {len(docs)}") - store.save_local(f"{folder_name}") - break - c1 += 1 - if settings.VECTOR_STORE == "faiss": - store.save_local(f"{folder_name}") diff --git a/application/parser/token_func.py b/application/parser/token_func.py deleted file mode 100644 index 7511cde0..00000000 --- a/application/parser/token_func.py +++ /dev/null @@ -1,79 +0,0 @@ -import re -from math import ceil -from typing import List - -import tiktoken -from application.parser.schema.base import Document - - -def separate_header_and_body(text): - header_pattern = r"^(.*?\n){3}" - match = re.match(header_pattern, text) - header = match.group(0) - body = text[len(header):] - return header, body - - -def group_documents(documents: List[Document], min_tokens: int, max_tokens: int) -> List[Document]: - docs = [] - current_group = None - - for doc in documents: - doc_len = len(tiktoken.get_encoding("cl100k_base").encode(doc.text)) - - # Check if current group is empty or if the document can be added based on token count and matching metadata - if (current_group is None or - (len(tiktoken.get_encoding("cl100k_base").encode(current_group.text)) + doc_len < max_tokens and - doc_len < min_tokens and - current_group.extra_info == doc.extra_info)): - if current_group is None: - current_group = doc # Use the document directly to retain its metadata - else: - current_group.text += " " + doc.text # Append text to the current group - else: - docs.append(current_group) - current_group = doc # Start a new group with the current document - - if current_group is not None: - docs.append(current_group) - - return docs - - -def split_documents(documents: List[Document], max_tokens: int) -> List[Document]: - docs = [] - for doc in documents: - token_length = len(tiktoken.get_encoding("cl100k_base").encode(doc.text)) - if token_length <= max_tokens: - docs.append(doc) - else: - header, body = separate_header_and_body(doc.text) - if len(tiktoken.get_encoding("cl100k_base").encode(header)) > max_tokens: - body = doc.text - header = "" - num_body_parts = ceil(token_length / max_tokens) - part_length = ceil(len(body) / num_body_parts) - body_parts = [body[i:i + part_length] for i in range(0, len(body), part_length)] - for i, body_part in enumerate(body_parts): - new_doc = Document(text=header + body_part.strip(), - doc_id=f"{doc.doc_id}-{i}", - embedding=doc.embedding, - extra_info=doc.extra_info) - docs.append(new_doc) - return docs - - -def group_split(documents: List[Document], max_tokens: int = 2000, min_tokens: int = 150, token_check: bool = True): - if not token_check: - return documents - print("Grouping small documents") - try: - documents = group_documents(documents=documents, min_tokens=min_tokens, max_tokens=max_tokens) - except Exception: - print("Grouping failed, try running without token_check") - print("Separating large documents") - try: - documents = split_documents(documents=documents, max_tokens=max_tokens) - except Exception: - print("Grouping failed, try running without token_check") - return documents diff --git a/application/worker.py b/application/worker.py index 33cd90e5..0edb46ff 100755 --- a/application/worker.py +++ b/application/worker.py @@ -12,10 +12,10 @@ from bson.objectid import ObjectId from application.core.mongo_db import MongoDB from application.core.settings import settings from application.parser.file.bulk import SimpleDirectoryReader -from application.parser.open_ai_func import call_openai_api +from application.parser.embedding_pipeline import embed_and_store_documents from application.parser.remote.remote_creator import RemoteCreator from application.parser.schema.base import Document -from application.parser.token_func import group_split +from application.parser.chunking import Chunker from application.utils import count_tokens_docs mongo = MongoDB.get_client() @@ -153,17 +153,19 @@ def ingest_worker( exclude_hidden=exclude, file_metadata=metadata_from_filename, ).load_data() - raw_docs = group_split( - documents=raw_docs, - min_tokens=MIN_TOKENS, + + chunker = Chunker( + chunking_strategy="classic_chunk", max_tokens=MAX_TOKENS, - token_check=token_check, + min_tokens=MIN_TOKENS, + duplicate_headers=False ) + raw_docs = chunker.chunk(documents=raw_docs) docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs] id = ObjectId() - call_openai_api(docs, full_path, id, self) + embed_and_store_documents(docs, full_path, id, self) tokens = count_tokens_docs(docs) self.update_state(state="PROGRESS", meta={"current": 100}) @@ -217,21 +219,23 @@ def remote_worker( remote_loader = RemoteCreator.create_loader(loader) raw_docs = remote_loader.load_data(source_data) - docs = group_split( - documents=raw_docs, - min_tokens=MIN_TOKENS, + chunker = Chunker( + chunking_strategy="classic_chunk", max_tokens=MAX_TOKENS, - token_check=token_check, + min_tokens=MIN_TOKENS, + duplicate_headers=False ) + docs = chunker.chunk(documents=raw_docs) + tokens = count_tokens_docs(docs) if operation_mode == "upload": id = ObjectId() - call_openai_api(docs, full_path, id, self) + embed_and_store_documents(docs, full_path, id, self) elif operation_mode == "sync": if not doc_id or not ObjectId.is_valid(doc_id): raise ValueError("doc_id must be provided for sync operation.") id = ObjectId(doc_id) - call_openai_api(docs, full_path, id, self) + embed_and_store_documents(docs, full_path, id, self) self.update_state(state="PROGRESS", meta={"current": 100}) file_data = { From b41a989051b437112f7a05ee36d8cb5df877d84b Mon Sep 17 00:00:00 2001 From: Pavel Date: Fri, 20 Dec 2024 18:41:47 +0300 Subject: [PATCH 03/26] test version --- application/parser/chunking.py | 118 +++++++++++++++++++++++ application/parser/embedding_pipeline.py | 86 +++++++++++++++++ application/parser/open_ai_func.py | 75 -------------- application/parser/token_func.py | 79 --------------- application/worker.py | 30 +++--- 5 files changed, 221 insertions(+), 167 deletions(-) create mode 100644 application/parser/chunking.py create mode 100755 application/parser/embedding_pipeline.py delete mode 100755 application/parser/open_ai_func.py delete mode 100644 application/parser/token_func.py diff --git a/application/parser/chunking.py b/application/parser/chunking.py new file mode 100644 index 00000000..26f05dba --- /dev/null +++ b/application/parser/chunking.py @@ -0,0 +1,118 @@ +import re +from typing import List, Tuple, Union +import logging +from application.parser.schema.base import Document +from application.utils import get_encoding + +logger = logging.getLogger(__name__) + +class Chunker: + def __init__( + self, + chunking_strategy: str = "classic_chunk", + max_tokens: int = 2000, + min_tokens: int = 150, + duplicate_headers: bool = False, + ): + if chunking_strategy not in ["classic_chunk"]: + raise ValueError(f"Unsupported chunking strategy: {chunking_strategy}") + self.chunking_strategy = chunking_strategy + self.max_tokens = max_tokens + self.min_tokens = min_tokens + self.duplicate_headers = duplicate_headers + self.encoding = get_encoding() + + def separate_header_and_body(self, text: str) -> Tuple[str, str]: + header_pattern = r"^(.*?\n){3}" + match = re.match(header_pattern, text) + if match: + header = match.group(0) + body = text[len(header):] + else: + header, body = "", text # No header, treat entire text as body + return header, body + + def combine_documents(self, doc: Document, next_doc: Document) -> Document: + combined_text = doc.text + " " + next_doc.text + combined_token_count = len(self.encoding.encode(combined_text)) + new_doc = Document( + text=combined_text, + doc_id=doc.doc_id, + embedding=doc.embedding, + extra_info={**(doc.extra_info or {}), "token_count": combined_token_count} + ) + return new_doc + + def split_document(self, doc: Document) -> List[Document]: + split_docs = [] + header, body = self.separate_header_and_body(doc.text) + header_tokens = self.encoding.encode(header) if header else [] + body_tokens = self.encoding.encode(body) + + current_position = 0 + part_index = 0 + while current_position < len(body_tokens): + end_position = current_position + self.max_tokens - len(header_tokens) + chunk_tokens = (header_tokens + body_tokens[current_position:end_position] + if self.duplicate_headers or part_index == 0 else body_tokens[current_position:end_position]) + chunk_text = self.encoding.decode(chunk_tokens) + new_doc = Document( + text=chunk_text, + doc_id=f"{doc.doc_id}-{part_index}", + embedding=doc.embedding, + extra_info={**(doc.extra_info or {}), "token_count": len(chunk_tokens)} + ) + split_docs.append(new_doc) + current_position = end_position + part_index += 1 + header_tokens = [] + return split_docs + + def classic_chunk(self, documents: List[Document]) -> List[Document]: + processed_docs = [] + i = 0 + while i < len(documents): + doc = documents[i] + tokens = self.encoding.encode(doc.text) + token_count = len(tokens) + + if self.min_tokens <= token_count <= self.max_tokens: + doc.extra_info = doc.extra_info or {} + doc.extra_info["token_count"] = token_count + processed_docs.append(doc) + i += 1 + elif token_count < self.min_tokens: + if i + 1 < len(documents): + next_doc = documents[i + 1] + next_tokens = self.encoding.encode(next_doc.text) + if token_count + len(next_tokens) <= self.max_tokens: + # Combine small documents + combined_doc = self.combine_documents(doc, next_doc) + processed_docs.append(combined_doc) + i += 2 + else: + # Keep the small document as is if adding next_doc would exceed max_tokens + doc.extra_info = doc.extra_info or {} + doc.extra_info["token_count"] = token_count + processed_docs.append(doc) + i += 1 + else: + # No next document to combine with; add the small document as is + doc.extra_info = doc.extra_info or {} + doc.extra_info["token_count"] = token_count + processed_docs.append(doc) + i += 1 + else: + # Split large documents + processed_docs.extend(self.split_document(doc)) + i += 1 + return processed_docs + + def chunk( + self, + documents: List[Document] + ) -> List[Document]: + if self.chunking_strategy == "classic_chunk": + return self.classic_chunk(documents) + else: + raise ValueError("Unsupported chunking strategy") diff --git a/application/parser/embedding_pipeline.py b/application/parser/embedding_pipeline.py new file mode 100755 index 00000000..6cf40048 --- /dev/null +++ b/application/parser/embedding_pipeline.py @@ -0,0 +1,86 @@ +import os +import logging +from retry import retry +from tqdm import tqdm +from application.core.settings import settings +from application.vectorstore.vector_creator import VectorCreator + + +@retry(tries=10, delay=60) +def add_text_to_store_with_retry(store, doc, source_id): + """ + Add a document's text and metadata to the vector store with retry logic. + Args: + store: The vector store object. + doc: The document to be added. + source_id: Unique identifier for the source. + """ + try: + doc.metadata["source_id"] = str(source_id) + store.add_texts([doc.page_content], metadatas=[doc.metadata]) + except Exception as e: + logging.error(f"Failed to add document with retry: {e}") + raise + + +def embed_and_store_documents(docs, folder_name, source_id, task_status): + """ + Embeds documents and stores them in a vector store. + + Args: + docs (list): List of documents to be embedded and stored. + folder_name (str): Directory to save the vector store. + source_id (str): Unique identifier for the source. + task_status: Task state manager for progress updates. + + Returns: + None + """ + # Ensure the folder exists + if not os.path.exists(folder_name): + os.makedirs(folder_name) + + # Initialize vector store + if settings.VECTOR_STORE == "faiss": + docs_init = [docs.pop(0)] + store = VectorCreator.create_vectorstore( + settings.VECTOR_STORE, + docs_init=docs_init, + source_id=folder_name, + embeddings_key=os.getenv("EMBEDDINGS_KEY"), + ) + else: + store = VectorCreator.create_vectorstore( + settings.VECTOR_STORE, + source_id=source_id, + embeddings_key=os.getenv("EMBEDDINGS_KEY"), + ) + store.delete_index() + + total_docs = len(docs) + + # Process and embed documents + for idx, doc in tqdm( + docs, + desc="Embedding 🦖", + unit="docs", + total=total_docs, + bar_format="{l_bar}{bar}| Time Left: {remaining}", + ): + try: + # Update task status for progress tracking + progress = int((idx / total_docs) * 100) + task_status.update_state(state="PROGRESS", meta={"current": progress}) + + # Add document to vector store + add_text_to_store_with_retry(store, doc, source_id) + except Exception as e: + logging.error(f"Error embedding document {idx}: {e}") + logging.info(f"Saving progress at document {idx} out of {total_docs}") + store.save_local(folder_name) + break + + # Save the vector store + if settings.VECTOR_STORE == "faiss": + store.save_local(folder_name) + logging.info("Vector store saved successfully.") diff --git a/application/parser/open_ai_func.py b/application/parser/open_ai_func.py deleted file mode 100755 index 3109f583..00000000 --- a/application/parser/open_ai_func.py +++ /dev/null @@ -1,75 +0,0 @@ -import os - -from retry import retry - -from application.core.settings import settings - -from application.vectorstore.vector_creator import VectorCreator - - -# from langchain_community.embeddings import HuggingFaceEmbeddings -# from langchain_community.embeddings import HuggingFaceInstructEmbeddings -# from langchain_community.embeddings import CohereEmbeddings - - -@retry(tries=10, delay=60) -def store_add_texts_with_retry(store, i, id): - # add source_id to the metadata - i.metadata["source_id"] = str(id) - store.add_texts([i.page_content], metadatas=[i.metadata]) - # store_pine.add_texts([i.page_content], metadatas=[i.metadata]) - - -def call_openai_api(docs, folder_name, id, task_status): - # Function to create a vector store from the documents and save it to disk - - if not os.path.exists(f"{folder_name}"): - os.makedirs(f"{folder_name}") - - from tqdm import tqdm - - c1 = 0 - if settings.VECTOR_STORE == "faiss": - docs_init = [docs[0]] - docs.pop(0) - - store = VectorCreator.create_vectorstore( - settings.VECTOR_STORE, - docs_init=docs_init, - source_id=f"{folder_name}", - embeddings_key=os.getenv("EMBEDDINGS_KEY"), - ) - else: - store = VectorCreator.create_vectorstore( - settings.VECTOR_STORE, - source_id=str(id), - embeddings_key=os.getenv("EMBEDDINGS_KEY"), - ) - store.delete_index() - # Uncomment for MPNet embeddings - # model_name = "sentence-transformers/all-mpnet-base-v2" - # hf = HuggingFaceEmbeddings(model_name=model_name) - # store = FAISS.from_documents(docs_test, hf) - s1 = len(docs) - for i in tqdm( - docs, - desc="Embedding 🦖", - unit="docs", - total=len(docs), - bar_format="{l_bar}{bar}| Time Left: {remaining}", - ): - try: - task_status.update_state( - state="PROGRESS", meta={"current": int((c1 / s1) * 100)} - ) - store_add_texts_with_retry(store, i, id) - except Exception as e: - print(e) - print("Error on ", i) - print("Saving progress") - print(f"stopped at {c1} out of {len(docs)}") - store.save_local(f"{folder_name}") - break - c1 += 1 - if settings.VECTOR_STORE == "faiss": - store.save_local(f"{folder_name}") diff --git a/application/parser/token_func.py b/application/parser/token_func.py deleted file mode 100644 index 7511cde0..00000000 --- a/application/parser/token_func.py +++ /dev/null @@ -1,79 +0,0 @@ -import re -from math import ceil -from typing import List - -import tiktoken -from application.parser.schema.base import Document - - -def separate_header_and_body(text): - header_pattern = r"^(.*?\n){3}" - match = re.match(header_pattern, text) - header = match.group(0) - body = text[len(header):] - return header, body - - -def group_documents(documents: List[Document], min_tokens: int, max_tokens: int) -> List[Document]: - docs = [] - current_group = None - - for doc in documents: - doc_len = len(tiktoken.get_encoding("cl100k_base").encode(doc.text)) - - # Check if current group is empty or if the document can be added based on token count and matching metadata - if (current_group is None or - (len(tiktoken.get_encoding("cl100k_base").encode(current_group.text)) + doc_len < max_tokens and - doc_len < min_tokens and - current_group.extra_info == doc.extra_info)): - if current_group is None: - current_group = doc # Use the document directly to retain its metadata - else: - current_group.text += " " + doc.text # Append text to the current group - else: - docs.append(current_group) - current_group = doc # Start a new group with the current document - - if current_group is not None: - docs.append(current_group) - - return docs - - -def split_documents(documents: List[Document], max_tokens: int) -> List[Document]: - docs = [] - for doc in documents: - token_length = len(tiktoken.get_encoding("cl100k_base").encode(doc.text)) - if token_length <= max_tokens: - docs.append(doc) - else: - header, body = separate_header_and_body(doc.text) - if len(tiktoken.get_encoding("cl100k_base").encode(header)) > max_tokens: - body = doc.text - header = "" - num_body_parts = ceil(token_length / max_tokens) - part_length = ceil(len(body) / num_body_parts) - body_parts = [body[i:i + part_length] for i in range(0, len(body), part_length)] - for i, body_part in enumerate(body_parts): - new_doc = Document(text=header + body_part.strip(), - doc_id=f"{doc.doc_id}-{i}", - embedding=doc.embedding, - extra_info=doc.extra_info) - docs.append(new_doc) - return docs - - -def group_split(documents: List[Document], max_tokens: int = 2000, min_tokens: int = 150, token_check: bool = True): - if not token_check: - return documents - print("Grouping small documents") - try: - documents = group_documents(documents=documents, min_tokens=min_tokens, max_tokens=max_tokens) - except Exception: - print("Grouping failed, try running without token_check") - print("Separating large documents") - try: - documents = split_documents(documents=documents, max_tokens=max_tokens) - except Exception: - print("Grouping failed, try running without token_check") - return documents diff --git a/application/worker.py b/application/worker.py index 33cd90e5..0edb46ff 100755 --- a/application/worker.py +++ b/application/worker.py @@ -12,10 +12,10 @@ from bson.objectid import ObjectId from application.core.mongo_db import MongoDB from application.core.settings import settings from application.parser.file.bulk import SimpleDirectoryReader -from application.parser.open_ai_func import call_openai_api +from application.parser.embedding_pipeline import embed_and_store_documents from application.parser.remote.remote_creator import RemoteCreator from application.parser.schema.base import Document -from application.parser.token_func import group_split +from application.parser.chunking import Chunker from application.utils import count_tokens_docs mongo = MongoDB.get_client() @@ -153,17 +153,19 @@ def ingest_worker( exclude_hidden=exclude, file_metadata=metadata_from_filename, ).load_data() - raw_docs = group_split( - documents=raw_docs, - min_tokens=MIN_TOKENS, + + chunker = Chunker( + chunking_strategy="classic_chunk", max_tokens=MAX_TOKENS, - token_check=token_check, + min_tokens=MIN_TOKENS, + duplicate_headers=False ) + raw_docs = chunker.chunk(documents=raw_docs) docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs] id = ObjectId() - call_openai_api(docs, full_path, id, self) + embed_and_store_documents(docs, full_path, id, self) tokens = count_tokens_docs(docs) self.update_state(state="PROGRESS", meta={"current": 100}) @@ -217,21 +219,23 @@ def remote_worker( remote_loader = RemoteCreator.create_loader(loader) raw_docs = remote_loader.load_data(source_data) - docs = group_split( - documents=raw_docs, - min_tokens=MIN_TOKENS, + chunker = Chunker( + chunking_strategy="classic_chunk", max_tokens=MAX_TOKENS, - token_check=token_check, + min_tokens=MIN_TOKENS, + duplicate_headers=False ) + docs = chunker.chunk(documents=raw_docs) + tokens = count_tokens_docs(docs) if operation_mode == "upload": id = ObjectId() - call_openai_api(docs, full_path, id, self) + embed_and_store_documents(docs, full_path, id, self) elif operation_mode == "sync": if not doc_id or not ObjectId.is_valid(doc_id): raise ValueError("doc_id must be provided for sync operation.") id = ObjectId(doc_id) - call_openai_api(docs, full_path, id, self) + embed_and_store_documents(docs, full_path, id, self) self.update_state(state="PROGRESS", meta={"current": 100}) file_data = { From 90962ee056df8d97d7d9284665d28fd0fe655ff8 Mon Sep 17 00:00:00 2001 From: Alex Date: Mon, 23 Dec 2024 17:41:13 +0000 Subject: [PATCH 04/26] fix: debugger in launch json --- .vscode/launch.json | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/.vscode/launch.json b/.vscode/launch.json index 5be1f711..5083d977 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -13,7 +13,7 @@ ] }, { - "name": "Python Debugger: Flask", + "name": "Flask Debugger", "type": "debugpy", "request": "launch", "module": "flask", @@ -32,5 +32,23 @@ ], "cwd": "${workspaceFolder}", }, + { + "name": "Celery Debugger", + "type": "debugpy", + "request": "launch", + "module": "celery", + "env": { + "PYTHONPATH": "${workspaceFolder}", + }, + "args": [ + "-A", + "application.app.celery", + "worker", + "-l", + "INFO", + "--pool=solo" + ], + "cwd": "${workspaceFolder}" + } ] } \ No newline at end of file From 41b4c28430ae29eded99b62e82e1ad4b863e99a3 Mon Sep 17 00:00:00 2001 From: Alex Date: Mon, 23 Dec 2024 17:41:44 +0000 Subject: [PATCH 05/26] fix: linting --- application/parser/chunking.py | 2 +- application/parser/embedding_pipeline.py | 4 ++-- application/worker.py | 2 -- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/application/parser/chunking.py b/application/parser/chunking.py index 26f05dba..aae14898 100644 --- a/application/parser/chunking.py +++ b/application/parser/chunking.py @@ -1,5 +1,5 @@ import re -from typing import List, Tuple, Union +from typing import List, Tuple import logging from application.parser.schema.base import Document from application.utils import get_encoding diff --git a/application/parser/embedding_pipeline.py b/application/parser/embedding_pipeline.py index 6cf40048..0435cd14 100755 --- a/application/parser/embedding_pipeline.py +++ b/application/parser/embedding_pipeline.py @@ -61,7 +61,7 @@ def embed_and_store_documents(docs, folder_name, source_id, task_status): # Process and embed documents for idx, doc in tqdm( - docs, + enumerate(docs), desc="Embedding 🦖", unit="docs", total=total_docs, @@ -69,7 +69,7 @@ def embed_and_store_documents(docs, folder_name, source_id, task_status): ): try: # Update task status for progress tracking - progress = int((idx / total_docs) * 100) + progress = int(((idx + 1) / total_docs) * 100) task_status.update_state(state="PROGRESS", meta={"current": progress}) # Add document to vector store diff --git a/application/worker.py b/application/worker.py index 0edb46ff..f4f181e5 100755 --- a/application/worker.py +++ b/application/worker.py @@ -126,7 +126,6 @@ def ingest_worker( limit = None exclude = True sample = False - token_check = True full_path = os.path.join(directory, user, name_job) logging.info(f"Ingest file: {full_path}", extra={"user": user, "job": name_job}) @@ -205,7 +204,6 @@ def remote_worker( operation_mode="upload", doc_id=None, ): - token_check = True full_path = os.path.join(directory, user, name_job) if not os.path.exists(full_path): From b2a013c02739d818fcd2f16513286815b7c04905 Mon Sep 17 00:00:00 2001 From: Alex Date: Mon, 23 Dec 2024 18:11:15 +0000 Subject: [PATCH 06/26] fix: remove reqs from scripts folder --- scripts/requirements.txt | 22 ---------------------- 1 file changed, 22 deletions(-) delete mode 100644 scripts/requirements.txt diff --git a/scripts/requirements.txt b/scripts/requirements.txt deleted file mode 100644 index d90af2c3..00000000 --- a/scripts/requirements.txt +++ /dev/null @@ -1,22 +0,0 @@ -dataclasses_json==0.6.3 -docx2txt==0.8 -EbookLib==0.18 -escodegen==1.0.11 -esprima==4.0.1 -faiss_cpu==1.7.4 -html2text==2020.1.16 -javalang==0.13.0 -langchain==0.2.10 -langchain_community==0.2.9 -langchain-openai==0.0.5 -nltk==3.9 -openapi3_parser==1.1.16 -pandas==2.2.0 -PyPDF2==3.0.1 -python-dotenv==1.0.1 -retry==0.9.2 -Sphinx==7.2.6 -tiktoken==0.5.2 -tqdm==4.66.3 -typer==0.9.0 -unstructured==0.12.2 From 474298c969e0f08779dbaddf6bfae7a8b2dff261 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 23 Dec 2024 18:12:13 +0000 Subject: [PATCH 07/26] build(deps): bump jinja2 from 3.1.4 to 3.1.5 in /application Bumps [jinja2](https://github.com/pallets/jinja) from 3.1.4 to 3.1.5. - [Release notes](https://github.com/pallets/jinja/releases) - [Changelog](https://github.com/pallets/jinja/blob/main/CHANGES.rst) - [Commits](https://github.com/pallets/jinja/compare/3.1.4...3.1.5) --- updated-dependencies: - dependency-name: jinja2 dependency-type: direct:production ... Signed-off-by: dependabot[bot] --- application/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/application/requirements.txt b/application/requirements.txt index b9d2c33c..015eb545 100644 --- a/application/requirements.txt +++ b/application/requirements.txt @@ -18,7 +18,7 @@ gTTS==2.3.2 gunicorn==23.0.0 html2text==2024.2.26 javalang==0.13.0 -jinja2==3.1.4 +jinja2==3.1.5 jiter==0.5.0 jmespath==1.0.1 joblib==1.4.2 From 7760e779aeee4cffceb4cc157ac75e32cfc8650a Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 23 Dec 2024 20:14:33 +0000 Subject: [PATCH 08/26] build(deps): bump i18next from 23.15.1 to 24.2.0 in /frontend Bumps [i18next](https://github.com/i18next/i18next) from 23.15.1 to 24.2.0. - [Release notes](https://github.com/i18next/i18next/releases) - [Changelog](https://github.com/i18next/i18next/blob/master/CHANGELOG.md) - [Commits](https://github.com/i18next/i18next/compare/v23.15.1...v24.2.0) --- updated-dependencies: - dependency-name: i18next dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] --- frontend/package-lock.json | 20 ++++++++++++++------ frontend/package.json | 2 +- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/frontend/package-lock.json b/frontend/package-lock.json index f96a17d4..4371d7c3 100644 --- a/frontend/package-lock.json +++ b/frontend/package-lock.json @@ -10,7 +10,7 @@ "dependencies": { "@reduxjs/toolkit": "^2.2.7", "chart.js": "^4.4.4", - "i18next": "^23.15.1", + "i18next": "^24.2.0", "i18next-browser-languagedetector": "^8.0.0", "prop-types": "^15.8.1", "react": "^18.2.0", @@ -1649,7 +1649,7 @@ "version": "18.3.0", "resolved": "https://registry.npmjs.org/@types/react-dom/-/react-dom-18.3.0.tgz", "integrity": "sha512-EhwApuTmMBmXuFOikhQLIBUn6uFg81SwLMOAUgodJF14SOBOCMdU04gDoYi0WOJJHD144TL32z4yDqCW3dnkQg==", - "dev": true, + "devOptional": true, "dependencies": { "@types/react": "*" } @@ -4921,9 +4921,9 @@ } }, "node_modules/i18next": { - "version": "23.15.1", - "resolved": "https://registry.npmjs.org/i18next/-/i18next-23.15.1.tgz", - "integrity": "sha512-wB4abZ3uK7EWodYisHl/asf8UYEhrI/vj/8aoSsrj/ZDxj4/UXPOa1KvFt1Fq5hkUHquNqwFlDprmjZ8iySgYA==", + "version": "24.2.0", + "resolved": "https://registry.npmjs.org/i18next/-/i18next-24.2.0.tgz", + "integrity": "sha512-ArJJTS1lV6lgKH7yEf4EpgNZ7+THl7bsGxxougPYiXRTJ/Fe1j08/TBpV9QsXCIYVfdE/HWG/xLezJ5DOlfBOA==", "funding": [ { "type": "individual", @@ -4940,6 +4940,14 @@ ], "dependencies": { "@babel/runtime": "^7.23.2" + }, + "peerDependencies": { + "typescript": "^5" + }, + "peerDependenciesMeta": { + "typescript": { + "optional": true + } } }, "node_modules/i18next-browser-languagedetector": { @@ -9250,7 +9258,7 @@ "version": "5.6.2", "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.6.2.tgz", "integrity": "sha512-NW8ByodCSNCwZeghjN3o+JX5OFH0Ojg6sadjEKY4huZ52TqbJTJnDo5+Tw98lSy63NZvi4n+ez5m2u5d4PkZyw==", - "dev": true, + "devOptional": true, "bin": { "tsc": "bin/tsc", "tsserver": "bin/tsserver" diff --git a/frontend/package.json b/frontend/package.json index 868a72ae..ca6ca518 100644 --- a/frontend/package.json +++ b/frontend/package.json @@ -21,7 +21,7 @@ "dependencies": { "@reduxjs/toolkit": "^2.2.7", "chart.js": "^4.4.4", - "i18next": "^23.15.1", + "i18next": "^24.2.0", "i18next-browser-languagedetector": "^8.0.0", "prop-types": "^15.8.1", "react": "^18.2.0", From 502d82e1c9e7f003b485e34f81d44221e875677e Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 23 Dec 2024 20:58:59 +0000 Subject: [PATCH 09/26] build(deps): bump langchain-openai from 0.2.0 to 0.2.14 in /application Bumps [langchain-openai](https://github.com/langchain-ai/langchain) from 0.2.0 to 0.2.14. - [Release notes](https://github.com/langchain-ai/langchain/releases) - [Commits](https://github.com/langchain-ai/langchain/compare/langchain-openai==0.2.0...langchain-openai==0.2.14) --- updated-dependencies: - dependency-name: langchain-openai dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- application/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/application/requirements.txt b/application/requirements.txt index b9d2c33c..dfd1898e 100644 --- a/application/requirements.txt +++ b/application/requirements.txt @@ -31,7 +31,7 @@ kombu==5.4.2 langchain==0.3.11 langchain-community==0.3.11 langchain-core==0.3.25 -langchain-openai==0.2.0 +langchain-openai==0.2.14 langchain-text-splitters==0.3.0 langsmith==0.2.3 lazy-object-proxy==1.10.0 From 52dd3f798a59932661fdf374460c60ec94a430f5 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 23 Dec 2024 21:06:38 +0000 Subject: [PATCH 10/26] build(deps): bump jinja2 from 3.1.4 to 3.1.5 in /application Bumps [jinja2](https://github.com/pallets/jinja) from 3.1.4 to 3.1.5. - [Release notes](https://github.com/pallets/jinja/releases) - [Changelog](https://github.com/pallets/jinja/blob/main/CHANGES.rst) - [Commits](https://github.com/pallets/jinja/compare/3.1.4...3.1.5) --- updated-dependencies: - dependency-name: jinja2 dependency-type: direct:production ... Signed-off-by: dependabot[bot] --- application/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/application/requirements.txt b/application/requirements.txt index a8b909e5..e4e3f232 100644 --- a/application/requirements.txt +++ b/application/requirements.txt @@ -18,7 +18,7 @@ gTTS==2.3.2 gunicorn==23.0.0 html2text==2024.2.26 javalang==0.13.0 -jinja2==3.1.4 +jinja2==3.1.5 jiter==0.5.0 jmespath==1.0.1 joblib==1.4.2 From ab90a93eec9f7cd5b505b62356a3ab6391f8c1b4 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 23 Dec 2024 21:15:46 +0000 Subject: [PATCH 11/26] build(deps): bump numpy from 1.26.4 to 2.2.1 in /application Bumps [numpy](https://github.com/numpy/numpy) from 1.26.4 to 2.2.1. - [Release notes](https://github.com/numpy/numpy/releases) - [Changelog](https://github.com/numpy/numpy/blob/main/doc/RELEASE_WALKTHROUGH.rst) - [Commits](https://github.com/numpy/numpy/compare/v1.26.4...v2.2.1) --- updated-dependencies: - dependency-name: numpy dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] --- application/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/application/requirements.txt b/application/requirements.txt index e4e3f232..57da7d9a 100644 --- a/application/requirements.txt +++ b/application/requirements.txt @@ -42,7 +42,7 @@ mpmath==1.3.0 multidict==6.1.0 mypy-extensions==1.0.0 networkx==3.3 -numpy==1.26.4 +numpy==2.2.1 openai==1.57.0 openapi-schema-validator==0.6.2 openapi-spec-validator==0.6.0 From fb2df05e3feb21ec763aa283facce70e45aad9ac Mon Sep 17 00:00:00 2001 From: Alex Date: Mon, 23 Dec 2024 21:23:54 +0000 Subject: [PATCH 12/26] feat: upgrade python and bump faiss-cpu --- application/Dockerfile | 14 +++++++------- application/requirements.txt | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/application/Dockerfile b/application/Dockerfile index d076bc41..d422db45 100644 --- a/application/Dockerfile +++ b/application/Dockerfile @@ -8,14 +8,14 @@ RUN apt-get update && \ add-apt-repository ppa:deadsnakes/ppa && \ # Install necessary packages and Python apt-get update && \ - apt-get install -y --no-install-recommends gcc wget unzip libc6-dev python3.11 python3.11-distutils python3.11-venv && \ + apt-get install -y --no-install-recommends gcc wget unzip libc6-dev python3.12 python3.12-venv && \ rm -rf /var/lib/apt/lists/* # Verify Python installation and setup symlink -RUN if [ -f /usr/bin/python3.11 ]; then \ - ln -s /usr/bin/python3.11 /usr/bin/python; \ +RUN if [ -f /usr/bin/python3.12 ]; then \ + ln -s /usr/bin/python3.12 /usr/bin/python; \ else \ - echo "Python 3.11 not found"; exit 1; \ + echo "Python 3.12 not found"; exit 1; \ fi # Download and unzip the model @@ -33,7 +33,7 @@ RUN apt-get remove --purge -y wget unzip && apt-get autoremove -y && rm -rf /var COPY requirements.txt . # Setup Python virtual environment -RUN python3.11 -m venv /venv +RUN python3.12 -m venv /venv # Activate virtual environment and install Python packages ENV PATH="/venv/bin:$PATH" @@ -50,8 +50,8 @@ RUN apt-get update && \ apt-get install -y software-properties-common && \ add-apt-repository ppa:deadsnakes/ppa && \ # Install Python - apt-get update && apt-get install -y --no-install-recommends python3.11 && \ - ln -s /usr/bin/python3.11 /usr/bin/python && \ + apt-get update && apt-get install -y --no-install-recommends python3.12 && \ + ln -s /usr/bin/python3.12 /usr/bin/python && \ rm -rf /var/lib/apt/lists/* # Set working directory diff --git a/application/requirements.txt b/application/requirements.txt index 57da7d9a..787be450 100644 --- a/application/requirements.txt +++ b/application/requirements.txt @@ -12,7 +12,7 @@ escodegen==1.0.11 esprima==4.0.1 esutils==1.0.1 Flask==3.0.3 -faiss-cpu==1.8.0.post1 +faiss-cpu==1.9.0.post1 flask-restx==1.3.0 gTTS==2.3.2 gunicorn==23.0.0 From 4927b64d273a34d56f74cebf2bcfb1a25357de99 Mon Sep 17 00:00:00 2001 From: Alex Date: Mon, 23 Dec 2024 21:26:22 +0000 Subject: [PATCH 13/26] bump pytest --- .github/workflows/pytest.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index cf68ff9c..d5b31109 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -6,7 +6,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.11"] + python-version: ["3.12"] steps: - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} @@ -23,7 +23,7 @@ jobs: run: | python -m pytest --cov=application --cov-report=xml - name: Upload coverage reports to Codecov - if: github.event_name == 'pull_request' && matrix.python-version == '3.11' + if: github.event_name == 'pull_request' && matrix.python-version == '3.12' uses: codecov/codecov-action@v5 env: CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} From fdd2300517cec29836ec687d9dd49ee2715c3163 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 23 Dec 2024 21:31:04 +0000 Subject: [PATCH 14/26] build(deps): bump langchain-openai from 0.2.0 to 0.2.14 in /application Bumps [langchain-openai](https://github.com/langchain-ai/langchain) from 0.2.0 to 0.2.14. - [Release notes](https://github.com/langchain-ai/langchain/releases) - [Commits](https://github.com/langchain-ai/langchain/compare/langchain-openai==0.2.0...langchain-openai==0.2.14) --- updated-dependencies: - dependency-name: langchain-openai dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- application/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/application/requirements.txt b/application/requirements.txt index 787be450..f189a300 100644 --- a/application/requirements.txt +++ b/application/requirements.txt @@ -31,7 +31,7 @@ kombu==5.4.2 langchain==0.3.11 langchain-community==0.3.11 langchain-core==0.3.25 -langchain-openai==0.2.0 +langchain-openai==0.2.14 langchain-text-splitters==0.3.0 langsmith==0.2.3 lazy-object-proxy==1.10.0 From 36e4398bcb2fef487c17ce0530e5d93b45023abc Mon Sep 17 00:00:00 2001 From: Alex Date: Mon, 23 Dec 2024 21:39:33 +0000 Subject: [PATCH 15/26] fix: bump deps --- application/requirements.txt | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/application/requirements.txt b/application/requirements.txt index f189a300..da9d114e 100644 --- a/application/requirements.txt +++ b/application/requirements.txt @@ -28,11 +28,11 @@ jsonschema==4.23.0 jsonschema-spec==0.2.4 jsonschema-specifications==2023.7.1 kombu==5.4.2 -langchain==0.3.11 -langchain-community==0.3.11 -langchain-core==0.3.25 +langchain==0.3.13 +langchain-community==0.3.13 +langchain-core==0.3.28 langchain-openai==0.2.14 -langchain-text-splitters==0.3.0 +langchain-text-splitters==0.3.4 langsmith==0.2.3 lazy-object-proxy==1.10.0 lxml==5.3.0 @@ -43,7 +43,7 @@ multidict==6.1.0 mypy-extensions==1.0.0 networkx==3.3 numpy==2.2.1 -openai==1.57.0 +openai==1.58.1 openapi-schema-validator==0.6.2 openapi-spec-validator==0.6.0 openapi3-parser==1.1.18 From 2536bd098826a90b9f6582ccf58a98c73ae88e02 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 23 Dec 2024 21:44:39 +0000 Subject: [PATCH 16/26] build(deps): bump elasticsearch from 8.15.1 to 8.17.0 in /application Bumps [elasticsearch](https://github.com/elastic/elasticsearch-py) from 8.15.1 to 8.17.0. - [Release notes](https://github.com/elastic/elasticsearch-py/releases) - [Commits](https://github.com/elastic/elasticsearch-py/compare/v8.15.1...v8.17.0) --- updated-dependencies: - dependency-name: elasticsearch dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- application/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/application/requirements.txt b/application/requirements.txt index da9d114e..20e90232 100644 --- a/application/requirements.txt +++ b/application/requirements.txt @@ -7,7 +7,7 @@ docx2txt==0.8 duckduckgo-search==6.3.0 ebooklib==0.18 elastic-transport==8.15.0 -elasticsearch==8.15.1 +elasticsearch==8.17.0 escodegen==1.0.11 esprima==4.0.1 esutils==1.0.1 From e30291966a9761aa39f46296ccb4699bcaf44185 Mon Sep 17 00:00:00 2001 From: Alex Date: Mon, 23 Dec 2024 21:47:31 +0000 Subject: [PATCH 17/26] fix: bump elastic transport --- application/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/application/requirements.txt b/application/requirements.txt index 20e90232..08990ab2 100644 --- a/application/requirements.txt +++ b/application/requirements.txt @@ -6,7 +6,7 @@ dataclasses-json==0.6.7 docx2txt==0.8 duckduckgo-search==6.3.0 ebooklib==0.18 -elastic-transport==8.15.0 +elastic-transport==8.15.1 elasticsearch==8.17.0 escodegen==1.0.11 esprima==4.0.1 From ba9e2101bbf81629af4093df36d36b85b3995d0a Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 23 Dec 2024 21:52:27 +0000 Subject: [PATCH 18/26] build(deps-dev): bump postcss from 8.4.47 to 8.4.49 in /frontend Bumps [postcss](https://github.com/postcss/postcss) from 8.4.47 to 8.4.49. - [Release notes](https://github.com/postcss/postcss/releases) - [Changelog](https://github.com/postcss/postcss/blob/main/CHANGELOG.md) - [Commits](https://github.com/postcss/postcss/compare/8.4.47...8.4.49) --- updated-dependencies: - dependency-name: postcss dependency-type: direct:development update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- frontend/package-lock.json | 10 +++++----- frontend/package.json | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/frontend/package-lock.json b/frontend/package-lock.json index 4371d7c3..4d6759da 100644 --- a/frontend/package-lock.json +++ b/frontend/package-lock.json @@ -48,7 +48,7 @@ "eslint-plugin-unused-imports": "^4.1.4", "husky": "^8.0.0", "lint-staged": "^15.2.10", - "postcss": "^8.4.41", + "postcss": "^8.4.49", "prettier": "^3.3.3", "prettier-plugin-tailwindcss": "^0.6.8", "tailwindcss": "^3.4.15", @@ -7466,9 +7466,9 @@ } }, "node_modules/postcss": { - "version": "8.4.47", - "resolved": "https://registry.npmjs.org/postcss/-/postcss-8.4.47.tgz", - "integrity": "sha512-56rxCq7G/XfB4EkXq9Egn5GCqugWvDFjafDOThIdMBsI15iqPqR5r15TfSr1YPYeEI19YeaXMCbY6u88Y76GLQ==", + "version": "8.4.49", + "resolved": "https://registry.npmjs.org/postcss/-/postcss-8.4.49.tgz", + "integrity": "sha512-OCVPnIObs4N29kxTjzLfUryOkvZEq+pf8jTF0lg8E7uETuWHA+v7j3c/xJmiqpX450191LlmZfUKkXxkTry7nA==", "dev": true, "funding": [ { @@ -7486,7 +7486,7 @@ ], "dependencies": { "nanoid": "^3.3.7", - "picocolors": "^1.1.0", + "picocolors": "^1.1.1", "source-map-js": "^1.2.1" }, "engines": { diff --git a/frontend/package.json b/frontend/package.json index ca6ca518..220f9759 100644 --- a/frontend/package.json +++ b/frontend/package.json @@ -59,7 +59,7 @@ "eslint-plugin-unused-imports": "^4.1.4", "husky": "^8.0.0", "lint-staged": "^15.2.10", - "postcss": "^8.4.41", + "postcss": "^8.4.49", "prettier": "^3.3.3", "prettier-plugin-tailwindcss": "^0.6.8", "tailwindcss": "^3.4.15", From 636ac2a56c9b1974132fc24e77f0b785dc2b95ad Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 23 Dec 2024 21:58:40 +0000 Subject: [PATCH 19/26] build(deps-dev): bump typescript from 5.6.2 to 5.7.2 in /frontend Bumps [typescript](https://github.com/microsoft/TypeScript) from 5.6.2 to 5.7.2. - [Release notes](https://github.com/microsoft/TypeScript/releases) - [Changelog](https://github.com/microsoft/TypeScript/blob/main/azure-pipelines.release.yml) - [Commits](https://github.com/microsoft/TypeScript/compare/v5.6.2...v5.7.2) --- updated-dependencies: - dependency-name: typescript dependency-type: direct:development update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- frontend/package-lock.json | 8 ++++---- frontend/package.json | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/frontend/package-lock.json b/frontend/package-lock.json index 4d6759da..d6ea061e 100644 --- a/frontend/package-lock.json +++ b/frontend/package-lock.json @@ -52,7 +52,7 @@ "prettier": "^3.3.3", "prettier-plugin-tailwindcss": "^0.6.8", "tailwindcss": "^3.4.15", - "typescript": "^5.6.2", + "typescript": "^5.7.2", "vite": "^5.4.11", "vite-plugin-svgr": "^4.2.0" } @@ -9255,9 +9255,9 @@ } }, "node_modules/typescript": { - "version": "5.6.2", - "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.6.2.tgz", - "integrity": "sha512-NW8ByodCSNCwZeghjN3o+JX5OFH0Ojg6sadjEKY4huZ52TqbJTJnDo5+Tw98lSy63NZvi4n+ez5m2u5d4PkZyw==", + "version": "5.7.2", + "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.7.2.tgz", + "integrity": "sha512-i5t66RHxDvVN40HfDd1PsEThGNnlMCMT3jMUuoh9/0TaqWevNontacunWyN02LA9/fIbEWlcHZcgTKb9QoaLfg==", "devOptional": true, "bin": { "tsc": "bin/tsc", diff --git a/frontend/package.json b/frontend/package.json index 220f9759..83237b07 100644 --- a/frontend/package.json +++ b/frontend/package.json @@ -63,7 +63,7 @@ "prettier": "^3.3.3", "prettier-plugin-tailwindcss": "^0.6.8", "tailwindcss": "^3.4.15", - "typescript": "^5.6.2", + "typescript": "^5.7.2", "vite": "^5.4.11", "vite-plugin-svgr": "^4.2.0" } From 868ea1a1e24e0691a88917a3ff6299bf057b1de5 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 23 Dec 2024 22:03:05 +0000 Subject: [PATCH 20/26] build(deps-dev): bump lint-staged from 15.2.10 to 15.2.11 in /frontend Bumps [lint-staged](https://github.com/lint-staged/lint-staged) from 15.2.10 to 15.2.11. - [Release notes](https://github.com/lint-staged/lint-staged/releases) - [Changelog](https://github.com/lint-staged/lint-staged/blob/master/CHANGELOG.md) - [Commits](https://github.com/lint-staged/lint-staged/compare/v15.2.10...v15.2.11) --- updated-dependencies: - dependency-name: lint-staged dependency-type: direct:development update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- frontend/package-lock.json | 138 ++++++++++++++----------------------- frontend/package.json | 2 +- 2 files changed, 52 insertions(+), 88 deletions(-) diff --git a/frontend/package-lock.json b/frontend/package-lock.json index d6ea061e..1fc37133 100644 --- a/frontend/package-lock.json +++ b/frontend/package-lock.json @@ -47,7 +47,7 @@ "eslint-plugin-react": "^7.37.2", "eslint-plugin-unused-imports": "^4.1.4", "husky": "^8.0.0", - "lint-staged": "^15.2.10", + "lint-staged": "^15.2.11", "postcss": "^8.4.49", "prettier": "^3.3.3", "prettier-plugin-tailwindcss": "^0.6.8", @@ -864,18 +864,6 @@ "url": "https://github.com/chalk/ansi-regex?sponsor=1" } }, - "node_modules/@isaacs/cliui/node_modules/ansi-styles": { - "version": "6.2.1", - "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-6.2.1.tgz", - "integrity": "sha512-bN798gFfQX+viw3R7yrGWRqnrN2oRkEkUjjl4JNn4E8GxxbjtG3FbrEIIY3l8/hrwUwIeCZvi4QuOTP4MErVug==", - "dev": true, - "engines": { - "node": ">=12" - }, - "funding": { - "url": "https://github.com/chalk/ansi-styles?sponsor=1" - } - }, "node_modules/@isaacs/cliui/node_modules/emoji-regex": { "version": "9.2.2", "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-9.2.2.tgz", @@ -2152,6 +2140,18 @@ "node": ">=8" } }, + "node_modules/ansi-styles": { + "version": "6.2.1", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-6.2.1.tgz", + "integrity": "sha512-bN798gFfQX+viw3R7yrGWRqnrN2oRkEkUjjl4JNn4E8GxxbjtG3FbrEIIY3l8/hrwUwIeCZvi4QuOTP4MErVug==", + "dev": true, + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/chalk/ansi-styles?sponsor=1" + } + }, "node_modules/any-promise": { "version": "1.3.0", "resolved": "https://registry.npmjs.org/any-promise/-/any-promise-1.3.0.tgz", @@ -2902,11 +2902,11 @@ } }, "node_modules/debug": { - "version": "4.3.6", - "resolved": "https://registry.npmjs.org/debug/-/debug-4.3.6.tgz", - "integrity": "sha512-O/09Bd4Z1fBrU4VzkhFqVgpPzaGbw6Sm9FEkBT1A/YBXQFGuuSxa1dN2nxgxS34JmKXqYx8CZAwEVoJFImUXIg==", + "version": "4.4.0", + "resolved": "https://registry.npmjs.org/debug/-/debug-4.4.0.tgz", + "integrity": "sha512-6WTZ/IxCY/T6BALoZHaE4ctp9xm+Z5kY/pzYaCHRFeyVhojxlrm+46y68HA6hr0TcwEssoxNiDEUJQjfPZ/RYA==", "dependencies": { - "ms": "2.1.2" + "ms": "^2.1.3" }, "engines": { "node": ">=6.0" @@ -3068,9 +3068,9 @@ "dev": true }, "node_modules/emoji-regex": { - "version": "10.3.0", - "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-10.3.0.tgz", - "integrity": "sha512-QpLs9D9v9kArv4lfDEgg1X/gN5XLnf/A6l9cs8SPZLRZR3ZkY9+kwIQTxm+fsSej5UMYGE8fdoaZVIBlqG0XTw==", + "version": "10.4.0", + "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-10.4.0.tgz", + "integrity": "sha512-EC+0oUMY1Rqm4O6LLrgjtYDvcVYTy7chDnM4Q7030tP4Kwj3u/pR6gP9ygnp2CJMK5Gq+9Q2oqmrFJAz01DXjw==", "dev": true }, "node_modules/entities": { @@ -4304,9 +4304,9 @@ } }, "node_modules/get-east-asian-width": { - "version": "1.2.0", - "resolved": "https://registry.npmjs.org/get-east-asian-width/-/get-east-asian-width-1.2.0.tgz", - "integrity": "sha512-2nk+7SIVb14QrgXFHcm84tD4bKQz0RxPuMT8Ag5KPOq7J5fEmAg0UbXdTOSHqNuHSU28k55qnceesxXRZGzKWA==", + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/get-east-asian-width/-/get-east-asian-width-1.3.0.tgz", + "integrity": "sha512-vpeMIQKxczTD/0s2CdEWHcb0eeJe6TFjxb+J5xgX7hScxqrGuyjmv4c1D4A/gelKfyox0gJJwIHF+fLjeaM8kQ==", "dev": true, "engines": { "node": ">=18" @@ -5671,21 +5671,21 @@ "dev": true }, "node_modules/lint-staged": { - "version": "15.2.10", - "resolved": "https://registry.npmjs.org/lint-staged/-/lint-staged-15.2.10.tgz", - "integrity": "sha512-5dY5t743e1byO19P9I4b3x8HJwalIznL5E1FWYnU6OWw33KxNBSLAc6Cy7F2PsFEO8FKnLwjwm5hx7aMF0jzZg==", + "version": "15.2.11", + "resolved": "https://registry.npmjs.org/lint-staged/-/lint-staged-15.2.11.tgz", + "integrity": "sha512-Ev6ivCTYRTGs9ychvpVw35m/bcNDuBN+mnTeObCL5h+boS5WzBEC6LHI4I9F/++sZm1m+J2LEiy0gxL/R9TBqQ==", "dev": true, "dependencies": { "chalk": "~5.3.0", "commander": "~12.1.0", - "debug": "~4.3.6", + "debug": "~4.4.0", "execa": "~8.0.1", - "lilconfig": "~3.1.2", - "listr2": "~8.2.4", + "lilconfig": "~3.1.3", + "listr2": "~8.2.5", "micromatch": "~4.0.8", "pidtree": "~0.6.0", "string-argv": "~0.3.2", - "yaml": "~2.5.0" + "yaml": "~2.6.1" }, "bin": { "lint-staged": "bin/lint-staged.js" @@ -5710,9 +5710,9 @@ } }, "node_modules/lint-staged/node_modules/lilconfig": { - "version": "3.1.2", - "resolved": "https://registry.npmjs.org/lilconfig/-/lilconfig-3.1.2.tgz", - "integrity": "sha512-eop+wDAvpItUys0FWkHIKeC9ybYrTGbU41U5K7+bttZZeohvnY7M9dZ5kB21GNWiFT2q1OoPTvncPCgSOVO5ow==", + "version": "3.1.3", + "resolved": "https://registry.npmjs.org/lilconfig/-/lilconfig-3.1.3.tgz", + "integrity": "sha512-/vlFKAoH5Cgt3Ie+JLhRbwOsCQePABiU3tJ1egGvyQ+33R/vcwM2Zl2QR/LzjsBeItPt3oSVXapn+m4nQDvpzw==", "dev": true, "engines": { "node": ">=14" @@ -5722,9 +5722,9 @@ } }, "node_modules/listr2": { - "version": "8.2.4", - "resolved": "https://registry.npmjs.org/listr2/-/listr2-8.2.4.tgz", - "integrity": "sha512-opevsywziHd3zHCVQGAj8zu+Z3yHNkkoYhWIGnq54RrCVwLz0MozotJEDnKsIBLvkfLGN6BLOyAeRrYI0pKA4g==", + "version": "8.2.5", + "resolved": "https://registry.npmjs.org/listr2/-/listr2-8.2.5.tgz", + "integrity": "sha512-iyAZCeyD+c1gPyE9qpFu8af0Y+MRtmKOncdGoA2S5EY8iFq99dmmvkNnHiWo+pj0s7yH7l3KPIgee77tKpXPWQ==", "dev": true, "dependencies": { "cli-truncate": "^4.0.0", @@ -5779,9 +5779,9 @@ } }, "node_modules/log-update/node_modules/ansi-regex": { - "version": "6.0.1", - "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-6.0.1.tgz", - "integrity": "sha512-n5M855fKb2SsfMIiFFoVrABHJC8QtHwVx+mHWP3QcEqBHYienj5dHSgjbxtC0WEZXYt4wcD6zrQElDPhFuZgfA==", + "version": "6.1.0", + "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-6.1.0.tgz", + "integrity": "sha512-7HSX4QQb4CspciLpVFwyRe79O3xsIZDDLER21kERQ71oaPodF8jL725AgJMFAYbooIqolJoRLuM81SpeUkpkvA==", "dev": true, "engines": { "node": ">=12" @@ -5790,18 +5790,6 @@ "url": "https://github.com/chalk/ansi-regex?sponsor=1" } }, - "node_modules/log-update/node_modules/ansi-styles": { - "version": "6.2.1", - "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-6.2.1.tgz", - "integrity": "sha512-bN798gFfQX+viw3R7yrGWRqnrN2oRkEkUjjl4JNn4E8GxxbjtG3FbrEIIY3l8/hrwUwIeCZvi4QuOTP4MErVug==", - "dev": true, - "engines": { - "node": ">=12" - }, - "funding": { - "url": "https://github.com/chalk/ansi-styles?sponsor=1" - } - }, "node_modules/log-update/node_modules/is-fullwidth-code-point": { "version": "5.0.0", "resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-5.0.0.tgz", @@ -6968,9 +6956,9 @@ } }, "node_modules/ms": { - "version": "2.1.2", - "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.2.tgz", - "integrity": "sha512-sGkPx+VjMtmA6MX27oA4FBFELFCZZ4S4XqeGOXCv68tT+jb3vk/RyaKWP0PTKyWtmLSM0b+adUTEvbs1PEaH2w==" + "version": "2.1.3", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz", + "integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==" }, "node_modules/mz": { "version": "2.7.0", @@ -8576,18 +8564,6 @@ "url": "https://github.com/chalk/slice-ansi?sponsor=1" } }, - "node_modules/slice-ansi/node_modules/ansi-styles": { - "version": "6.2.1", - "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-6.2.1.tgz", - "integrity": "sha512-bN798gFfQX+viw3R7yrGWRqnrN2oRkEkUjjl4JNn4E8GxxbjtG3FbrEIIY3l8/hrwUwIeCZvi4QuOTP4MErVug==", - "dev": true, - "engines": { - "node": ">=12" - }, - "funding": { - "url": "https://github.com/chalk/ansi-styles?sponsor=1" - } - }, "node_modules/snake-case": { "version": "3.0.4", "resolved": "https://registry.npmjs.org/snake-case/-/snake-case-3.0.4.tgz", @@ -8679,9 +8655,9 @@ } }, "node_modules/string-width/node_modules/ansi-regex": { - "version": "6.0.1", - "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-6.0.1.tgz", - "integrity": "sha512-n5M855fKb2SsfMIiFFoVrABHJC8QtHwVx+mHWP3QcEqBHYienj5dHSgjbxtC0WEZXYt4wcD6zrQElDPhFuZgfA==", + "version": "6.1.0", + "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-6.1.0.tgz", + "integrity": "sha512-7HSX4QQb4CspciLpVFwyRe79O3xsIZDDLER21kERQ71oaPodF8jL725AgJMFAYbooIqolJoRLuM81SpeUkpkvA==", "dev": true, "engines": { "node": ">=12" @@ -9775,9 +9751,9 @@ } }, "node_modules/wrap-ansi/node_modules/ansi-regex": { - "version": "6.0.1", - "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-6.0.1.tgz", - "integrity": "sha512-n5M855fKb2SsfMIiFFoVrABHJC8QtHwVx+mHWP3QcEqBHYienj5dHSgjbxtC0WEZXYt4wcD6zrQElDPhFuZgfA==", + "version": "6.1.0", + "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-6.1.0.tgz", + "integrity": "sha512-7HSX4QQb4CspciLpVFwyRe79O3xsIZDDLER21kERQ71oaPodF8jL725AgJMFAYbooIqolJoRLuM81SpeUkpkvA==", "dev": true, "engines": { "node": ">=12" @@ -9786,18 +9762,6 @@ "url": "https://github.com/chalk/ansi-regex?sponsor=1" } }, - "node_modules/wrap-ansi/node_modules/ansi-styles": { - "version": "6.2.1", - "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-6.2.1.tgz", - "integrity": "sha512-bN798gFfQX+viw3R7yrGWRqnrN2oRkEkUjjl4JNn4E8GxxbjtG3FbrEIIY3l8/hrwUwIeCZvi4QuOTP4MErVug==", - "dev": true, - "engines": { - "node": ">=12" - }, - "funding": { - "url": "https://github.com/chalk/ansi-styles?sponsor=1" - } - }, "node_modules/wrap-ansi/node_modules/strip-ansi": { "version": "7.1.0", "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-7.1.0.tgz", @@ -9834,9 +9798,9 @@ "dev": true }, "node_modules/yaml": { - "version": "2.5.1", - "resolved": "https://registry.npmjs.org/yaml/-/yaml-2.5.1.tgz", - "integrity": "sha512-bLQOjaX/ADgQ20isPJRvF0iRUHIxVhYvr53Of7wGcWlO2jvtUlH5m87DsmulFVxRpNLOnI4tB6p/oh8D7kpn9Q==", + "version": "2.6.1", + "resolved": "https://registry.npmjs.org/yaml/-/yaml-2.6.1.tgz", + "integrity": "sha512-7r0XPzioN/Q9kXBro/XPnA6kznR73DHq+GXh5ON7ZozRO6aMjbmiBuKste2wslTFkC5d1dw0GooOCepZXJ2SAg==", "dev": true, "bin": { "yaml": "bin.mjs" diff --git a/frontend/package.json b/frontend/package.json index 83237b07..0547d68d 100644 --- a/frontend/package.json +++ b/frontend/package.json @@ -58,7 +58,7 @@ "eslint-plugin-react": "^7.37.2", "eslint-plugin-unused-imports": "^4.1.4", "husky": "^8.0.0", - "lint-staged": "^15.2.10", + "lint-staged": "^15.2.11", "postcss": "^8.4.49", "prettier": "^3.3.3", "prettier-plugin-tailwindcss": "^0.6.8", From 753832d701cc127887adf933b8a6f48ab0818a9d Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 23 Dec 2024 22:06:40 +0000 Subject: [PATCH 21/26] build(deps): bump react-router-dom from 6.8.1 to 7.1.1 in /frontend Bumps [react-router-dom](https://github.com/remix-run/react-router/tree/HEAD/packages/react-router-dom) from 6.8.1 to 7.1.1. - [Release notes](https://github.com/remix-run/react-router/releases) - [Changelog](https://github.com/remix-run/react-router/blob/main/packages/react-router-dom/CHANGELOG.md) - [Commits](https://github.com/remix-run/react-router/commits/react-router-dom@7.1.1/packages/react-router-dom) --- updated-dependencies: - dependency-name: react-router-dom dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] --- frontend/package-lock.json | 69 +++++++++++++++++++++++++------------- frontend/package.json | 2 +- 2 files changed, 47 insertions(+), 24 deletions(-) diff --git a/frontend/package-lock.json b/frontend/package-lock.json index 1fc37133..9a3dbf31 100644 --- a/frontend/package-lock.json +++ b/frontend/package-lock.json @@ -22,7 +22,7 @@ "react-i18next": "^15.0.2", "react-markdown": "^9.0.1", "react-redux": "^8.0.5", - "react-router-dom": "^6.8.1", + "react-router-dom": "^7.1.1", "react-syntax-highlighter": "^15.5.0", "rehype-katex": "^7.0.1", "remark-gfm": "^4.0.0", @@ -1051,14 +1051,6 @@ } } }, - "node_modules/@remix-run/router": { - "version": "1.3.2", - "resolved": "https://registry.npmjs.org/@remix-run/router/-/router-1.3.2.tgz", - "integrity": "sha512-t54ONhl/h75X94SWsHGQ4G/ZrCEguKSRQr7DrjTciJXW0YU1QhlwYeycvK5JgkzlxmvrK7wq1NB/PLtHxoiDcA==", - "engines": { - "node": ">=14" - } - }, "node_modules/@rollup/pluginutils": { "version": "5.1.0", "resolved": "https://registry.npmjs.org/@rollup/pluginutils/-/pluginutils-5.1.0.tgz", @@ -1549,6 +1541,11 @@ "@babel/types": "^7.20.7" } }, + "node_modules/@types/cookie": { + "version": "0.6.0", + "resolved": "https://registry.npmjs.org/@types/cookie/-/cookie-0.6.0.tgz", + "integrity": "sha512-4Kh9a6B2bQciAhf7FSuMRRkUWecJgJu9nPnx3yzpsfXX/c50REIqpHY4C82bXP90qrLtXtkDxTZosYO3UpOwlA==" + }, "node_modules/@types/debug": { "version": "4.1.12", "resolved": "https://registry.npmjs.org/@types/debug/-/debug-4.1.12.tgz", @@ -2784,6 +2781,14 @@ "integrity": "sha512-Kvp459HrV2FEJ1CAsi1Ku+MY3kasH19TFykTz2xWmMeq6bk2NU3XXvfJ+Q61m0xktWwt+1HSYf3JZsTms3aRJg==", "dev": true }, + "node_modules/cookie": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/cookie/-/cookie-1.0.2.tgz", + "integrity": "sha512-9Kr/j4O16ISv8zBBhJoi4bXOYNTkFLOqSL3UDB0njXxCXNezjeyVrJyGOWtgfs/q2km1gwBcfH8q1yEGoMYunA==", + "engines": { + "node": ">=18" + } + }, "node_modules/copy-to-clipboard": { "version": "3.3.3", "resolved": "https://registry.npmjs.org/copy-to-clipboard/-/copy-to-clipboard-3.3.3.tgz", @@ -7972,33 +7977,41 @@ } }, "node_modules/react-router": { - "version": "6.8.1", - "resolved": "https://registry.npmjs.org/react-router/-/react-router-6.8.1.tgz", - "integrity": "sha512-Jgi8BzAJQ8MkPt8ipXnR73rnD7EmZ0HFFb7jdQU24TynGW1Ooqin2KVDN9voSC+7xhqbbCd2cjGUepb6RObnyg==", + "version": "7.1.1", + "resolved": "https://registry.npmjs.org/react-router/-/react-router-7.1.1.tgz", + "integrity": "sha512-39sXJkftkKWRZ2oJtHhCxmoCrBCULr/HAH4IT5DHlgu/Q0FCPV0S4Lx+abjDTx/74xoZzNYDYbOZWlJjruyuDQ==", "dependencies": { - "@remix-run/router": "1.3.2" + "@types/cookie": "^0.6.0", + "cookie": "^1.0.1", + "set-cookie-parser": "^2.6.0", + "turbo-stream": "2.4.0" }, "engines": { - "node": ">=14" + "node": ">=20.0.0" }, "peerDependencies": { - "react": ">=16.8" + "react": ">=18", + "react-dom": ">=18" + }, + "peerDependenciesMeta": { + "react-dom": { + "optional": true + } } }, "node_modules/react-router-dom": { - "version": "6.8.1", - "resolved": "https://registry.npmjs.org/react-router-dom/-/react-router-dom-6.8.1.tgz", - "integrity": "sha512-67EXNfkQgf34P7+PSb6VlBuaacGhkKn3kpE51+P6zYSG2kiRoumXEL6e27zTa9+PGF2MNXbgIUHTVlleLbIcHQ==", + "version": "7.1.1", + "resolved": "https://registry.npmjs.org/react-router-dom/-/react-router-dom-7.1.1.tgz", + "integrity": "sha512-vSrQHWlJ5DCfyrhgo0k6zViOe9ToK8uT5XGSmnuC2R3/g261IdIMpZVqfjD6vWSXdnf5Czs4VA/V60oVR6/jnA==", "dependencies": { - "@remix-run/router": "1.3.2", - "react-router": "6.8.1" + "react-router": "7.1.1" }, "engines": { - "node": ">=14" + "node": ">=20.0.0" }, "peerDependencies": { - "react": ">=16.8", - "react-dom": ">=16.8" + "react": ">=18", + "react-dom": ">=18" } }, "node_modules/react-side-effect": { @@ -8456,6 +8469,11 @@ "semver": "bin/semver.js" } }, + "node_modules/set-cookie-parser": { + "version": "2.7.1", + "resolved": "https://registry.npmjs.org/set-cookie-parser/-/set-cookie-parser-2.7.1.tgz", + "integrity": "sha512-IOc8uWeOZgnb3ptbCURJWNjWUPcO3ZnTTdzsurqERrP6nPyv+paC55vJM0LpOlT2ne+Ix+9+CRG1MNLlyZ4GjQ==" + }, "node_modules/set-function-length": { "version": "1.2.2", "resolved": "https://registry.npmjs.org/set-function-length/-/set-function-length-1.2.2.tgz", @@ -9133,6 +9151,11 @@ "typescript": ">=2.8.0 || >= 3.2.0-dev || >= 3.3.0-dev || >= 3.4.0-dev || >= 3.5.0-dev || >= 3.6.0-dev || >= 3.6.0-beta || >= 3.7.0-dev || >= 3.7.0-beta" } }, + "node_modules/turbo-stream": { + "version": "2.4.0", + "resolved": "https://registry.npmjs.org/turbo-stream/-/turbo-stream-2.4.0.tgz", + "integrity": "sha512-FHncC10WpBd2eOmGwpmQsWLDoK4cqsA/UT/GqNoaKOQnT8uzhtCbg3EoUDMvqpOSAI0S26mr0rkjzbOO6S3v1g==" + }, "node_modules/type-check": { "version": "0.4.0", "resolved": "https://registry.npmjs.org/type-check/-/type-check-0.4.0.tgz", diff --git a/frontend/package.json b/frontend/package.json index 0547d68d..ff98e94c 100644 --- a/frontend/package.json +++ b/frontend/package.json @@ -33,7 +33,7 @@ "react-i18next": "^15.0.2", "react-markdown": "^9.0.1", "react-redux": "^8.0.5", - "react-router-dom": "^6.8.1", + "react-router-dom": "^7.1.1", "react-syntax-highlighter": "^15.5.0", "rehype-katex": "^7.0.1", "remark-gfm": "^4.0.0", From e45648b389544fa59f24f65ea1d58efc58c35ef4 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 30 Dec 2024 20:17:15 +0000 Subject: [PATCH 22/26] build(deps): bump langsmith from 0.2.3 to 0.2.6 in /application Bumps [langsmith](https://github.com/langchain-ai/langsmith-sdk) from 0.2.3 to 0.2.6. - [Release notes](https://github.com/langchain-ai/langsmith-sdk/releases) - [Commits](https://github.com/langchain-ai/langsmith-sdk/compare/v0.2.3...v0.2.6) --- updated-dependencies: - dependency-name: langsmith dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- application/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/application/requirements.txt b/application/requirements.txt index 08990ab2..754fb271 100644 --- a/application/requirements.txt +++ b/application/requirements.txt @@ -33,7 +33,7 @@ langchain-community==0.3.13 langchain-core==0.3.28 langchain-openai==0.2.14 langchain-text-splitters==0.3.4 -langsmith==0.2.3 +langsmith==0.2.6 lazy-object-proxy==1.10.0 lxml==5.3.0 markupsafe==2.1.5 From e42fc97d03e88a31f676ad8daffc912b6210e7da Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 30 Dec 2024 20:17:25 +0000 Subject: [PATCH 23/26] build(deps): bump celery from 5.3.6 to 5.4.0 in /application Bumps [celery](https://github.com/celery/celery) from 5.3.6 to 5.4.0. - [Release notes](https://github.com/celery/celery/releases) - [Changelog](https://github.com/celery/celery/blob/main/Changelog.rst) - [Commits](https://github.com/celery/celery/compare/v5.3.6...v5.4.0) --- updated-dependencies: - dependency-name: celery dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- application/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/application/requirements.txt b/application/requirements.txt index 08990ab2..41297624 100644 --- a/application/requirements.txt +++ b/application/requirements.txt @@ -1,7 +1,7 @@ anthropic==0.40.0 boto3==1.34.153 beautifulsoup4==4.12.3 -celery==5.3.6 +celery==5.4.0 dataclasses-json==0.6.7 docx2txt==0.8 duckduckgo-search==6.3.0 From 0f611eb87bf038154afb20b31350027eecf23ae0 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 31 Dec 2024 14:52:21 +0000 Subject: [PATCH 24/26] build(deps): bump redis from 5.0.1 to 5.2.1 in /application Bumps [redis](https://github.com/redis/redis-py) from 5.0.1 to 5.2.1. - [Release notes](https://github.com/redis/redis-py/releases) - [Changelog](https://github.com/redis/redis-py/blob/master/CHANGES) - [Commits](https://github.com/redis/redis-py/compare/v5.0.1...v5.2.1) --- updated-dependencies: - dependency-name: redis dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- application/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/application/requirements.txt b/application/requirements.txt index b7660c42..6a318338 100644 --- a/application/requirements.txt +++ b/application/requirements.txt @@ -68,7 +68,7 @@ python-dateutil==2.9.0.post0 python-dotenv==1.0.1 python-pptx==1.0.2 qdrant-client==1.11.0 -redis==5.0.1 +redis==5.2.1 referencing==0.30.2 regex==2024.9.11 requests==2.32.3 From 3daeab5186e41289656b5a5d71b86c38a526a62e Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 31 Dec 2024 14:57:29 +0000 Subject: [PATCH 25/26] build(deps): bump tiktoken from 0.7.0 to 0.8.0 in /application Bumps [tiktoken](https://github.com/openai/tiktoken) from 0.7.0 to 0.8.0. - [Release notes](https://github.com/openai/tiktoken/releases) - [Changelog](https://github.com/openai/tiktoken/blob/main/CHANGELOG.md) - [Commits](https://github.com/openai/tiktoken/compare/0.7.0...0.8.0) --- updated-dependencies: - dependency-name: tiktoken dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- application/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/application/requirements.txt b/application/requirements.txt index 6a318338..7c9f8101 100644 --- a/application/requirements.txt +++ b/application/requirements.txt @@ -74,7 +74,7 @@ regex==2024.9.11 requests==2.32.3 retry==0.9.2 sentence-transformers==3.3.1 -tiktoken==0.7.0 +tiktoken==0.8.0 tokenizers==0.21.0 torch==2.4.1 tqdm==4.66.5 From efb018d2b068a88dd99b8b32bb08f106517294e4 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 31 Dec 2024 14:58:26 +0000 Subject: [PATCH 26/26] build(deps): bump marshmallow from 3.22.0 to 3.23.2 in /application Bumps [marshmallow](https://github.com/marshmallow-code/marshmallow) from 3.22.0 to 3.23.2. - [Changelog](https://github.com/marshmallow-code/marshmallow/blob/dev/CHANGELOG.rst) - [Commits](https://github.com/marshmallow-code/marshmallow/compare/3.22.0...3.23.2) --- updated-dependencies: - dependency-name: marshmallow dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- application/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/application/requirements.txt b/application/requirements.txt index 7c9f8101..362fdd45 100644 --- a/application/requirements.txt +++ b/application/requirements.txt @@ -37,7 +37,7 @@ langsmith==0.2.6 lazy-object-proxy==1.10.0 lxml==5.3.0 markupsafe==2.1.5 -marshmallow==3.22.0 +marshmallow==3.23.2 mpmath==1.3.0 multidict==6.1.0 mypy-extensions==1.0.0