Merge branch 'arc53:main' into main

Author: Manish Madan
Date: 2025-01-01 15:14:40 +05:30 (committed by GitHub)
12 changed files with 391 additions and 348 deletions

.github/workflows (CI test workflow)

@@ -6,7 +6,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.11"]
python-version: ["3.12"]
steps:
- uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
@@ -23,7 +23,7 @@ jobs:
run: |
python -m pytest --cov=application --cov-report=xml
- name: Upload coverage reports to Codecov
if: github.event_name == 'pull_request' && matrix.python-version == '3.11'
if: github.event_name == 'pull_request' && matrix.python-version == '3.12'
uses: codecov/codecov-action@v5
env:
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}

.vscode/launch.json (vendored)

@@ -13,7 +13,7 @@
]
},
{
"name": "Python Debugger: Flask",
"name": "Flask Debugger",
"type": "debugpy",
"request": "launch",
"module": "flask",
@@ -32,5 +32,23 @@
],
"cwd": "${workspaceFolder}",
},
    {
      "name": "Celery Debugger",
      "type": "debugpy",
      "request": "launch",
      "module": "celery",
      "env": {
        "PYTHONPATH": "${workspaceFolder}",
      },
      "args": [
        "-A",
        "application.app.celery",
        "worker",
        "-l",
        "INFO",
        "--pool=solo"
      ],
      "cwd": "${workspaceFolder}"
    }
]
}
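The new Celery entry mirrors the Flask one above it. The --pool=solo flag matters for debugging: it runs tasks in the worker's main process, so debugpy can hit breakpoints inside task code, whereas the default prefork pool would execute tasks in child processes the debugger is not attached to. Setting PYTHONPATH to the workspace root keeps the application.app.celery module path resolvable.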

application/Dockerfile

@@ -8,14 +8,14 @@ RUN apt-get update && \
add-apt-repository ppa:deadsnakes/ppa && \
# Install necessary packages and Python
apt-get update && \
apt-get install -y --no-install-recommends gcc wget unzip libc6-dev python3.11 python3.11-distutils python3.11-venv && \
apt-get install -y --no-install-recommends gcc wget unzip libc6-dev python3.12 python3.12-venv && \
rm -rf /var/lib/apt/lists/*
# Verify Python installation and setup symlink
RUN if [ -f /usr/bin/python3.11 ]; then \
ln -s /usr/bin/python3.11 /usr/bin/python; \
RUN if [ -f /usr/bin/python3.12 ]; then \
ln -s /usr/bin/python3.12 /usr/bin/python; \
else \
echo "Python 3.11 not found"; exit 1; \
echo "Python 3.12 not found"; exit 1; \
fi
# Download and unzip the model
@@ -33,7 +33,7 @@ RUN apt-get remove --purge -y wget unzip && apt-get autoremove -y && rm -rf /var
COPY requirements.txt .
# Setup Python virtual environment
RUN python3.11 -m venv /venv
RUN python3.12 -m venv /venv
# Activate virtual environment and install Python packages
ENV PATH="/venv/bin:$PATH"
@@ -50,8 +50,8 @@ RUN apt-get update && \
apt-get install -y software-properties-common && \
add-apt-repository ppa:deadsnakes/ppa && \
# Install Python
apt-get update && apt-get install -y --no-install-recommends python3.11 && \
ln -s /usr/bin/python3.11 /usr/bin/python && \
apt-get update && apt-get install -y --no-install-recommends python3.12 && \
ln -s /usr/bin/python3.12 /usr/bin/python && \
rm -rf /var/lib/apt/lists/*
# Set working directory

application/parser/chunking.py (new file)

@@ -0,0 +1,118 @@
import re
import logging
from typing import List, Tuple

from application.parser.schema.base import Document
from application.utils import get_encoding

logger = logging.getLogger(__name__)


class Chunker:
    def __init__(
        self,
        chunking_strategy: str = "classic_chunk",
        max_tokens: int = 2000,
        min_tokens: int = 150,
        duplicate_headers: bool = False,
    ):
        if chunking_strategy not in ["classic_chunk"]:
            raise ValueError(f"Unsupported chunking strategy: {chunking_strategy}")
        self.chunking_strategy = chunking_strategy
        self.max_tokens = max_tokens
        self.min_tokens = min_tokens
        self.duplicate_headers = duplicate_headers
        self.encoding = get_encoding()

    def separate_header_and_body(self, text: str) -> Tuple[str, str]:
        header_pattern = r"^(.*?\n){3}"
        match = re.match(header_pattern, text)
        if match:
            header = match.group(0)
            body = text[len(header):]
        else:
            header, body = "", text  # No header, treat entire text as body
        return header, body

    def combine_documents(self, doc: Document, next_doc: Document) -> Document:
        combined_text = doc.text + " " + next_doc.text
        combined_token_count = len(self.encoding.encode(combined_text))
        new_doc = Document(
            text=combined_text,
            doc_id=doc.doc_id,
            embedding=doc.embedding,
            extra_info={**(doc.extra_info or {}), "token_count": combined_token_count}
        )
        return new_doc

    def split_document(self, doc: Document) -> List[Document]:
        split_docs = []
        header, body = self.separate_header_and_body(doc.text)
        header_tokens = self.encoding.encode(header) if header else []
        body_tokens = self.encoding.encode(body)

        current_position = 0
        part_index = 0
        while current_position < len(body_tokens):
            end_position = current_position + self.max_tokens - len(header_tokens)
            chunk_tokens = (header_tokens + body_tokens[current_position:end_position]
                            if self.duplicate_headers or part_index == 0
                            else body_tokens[current_position:end_position])
            chunk_text = self.encoding.decode(chunk_tokens)
            new_doc = Document(
                text=chunk_text,
                doc_id=f"{doc.doc_id}-{part_index}",
                embedding=doc.embedding,
                extra_info={**(doc.extra_info or {}), "token_count": len(chunk_tokens)}
            )
            split_docs.append(new_doc)
            current_position = end_position
            part_index += 1
            if not self.duplicate_headers:
                header_tokens = []  # Prepend the header to the first chunk only
        return split_docs

    def classic_chunk(self, documents: List[Document]) -> List[Document]:
        processed_docs = []
        i = 0
        while i < len(documents):
            doc = documents[i]
            tokens = self.encoding.encode(doc.text)
            token_count = len(tokens)

            if self.min_tokens <= token_count <= self.max_tokens:
                doc.extra_info = doc.extra_info or {}
                doc.extra_info["token_count"] = token_count
                processed_docs.append(doc)
                i += 1
            elif token_count < self.min_tokens:
                if i + 1 < len(documents):
                    next_doc = documents[i + 1]
                    next_tokens = self.encoding.encode(next_doc.text)
                    if token_count + len(next_tokens) <= self.max_tokens:
                        # Combine small documents
                        combined_doc = self.combine_documents(doc, next_doc)
                        processed_docs.append(combined_doc)
                        i += 2
                    else:
                        # Keep the small document as is if adding next_doc would exceed max_tokens
                        doc.extra_info = doc.extra_info or {}
                        doc.extra_info["token_count"] = token_count
                        processed_docs.append(doc)
                        i += 1
                else:
                    # No next document to combine with; add the small document as is
                    doc.extra_info = doc.extra_info or {}
                    doc.extra_info["token_count"] = token_count
                    processed_docs.append(doc)
                    i += 1
            else:
                # Split large documents
                processed_docs.extend(self.split_document(doc))
                i += 1
        return processed_docs

    def chunk(
        self,
        documents: List[Document]
    ) -> List[Document]:
        if self.chunking_strategy == "classic_chunk":
            return self.classic_chunk(documents)
        else:
            raise ValueError("Unsupported chunking strategy")
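
A minimal usage sketch of the new Chunker, mirroring the call sites in worker.py further down; the sample document and the printed fields are illustrative only, and it assumes the repository root is on PYTHONPATH:

from application.parser.chunking import Chunker
from application.parser.schema.base import Document

# One document whose first three lines act as the header the regex detects.
docs = [Document(text="Title\nAuthor\nDate\n" + "body text " * 400,
                 doc_id="demo", embedding=None, extra_info={})]

chunker = Chunker(
    chunking_strategy="classic_chunk",
    max_tokens=2000,
    min_tokens=150,
    duplicate_headers=False,
)
for chunk in chunker.chunk(documents=docs):
    print(chunk.doc_id, chunk.extra_info["token_count"])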

application/parser/embedding_pipeline.py (new file)

@@ -0,0 +1,86 @@
import os
import logging

from retry import retry
from tqdm import tqdm

from application.core.settings import settings
from application.vectorstore.vector_creator import VectorCreator


@retry(tries=10, delay=60)
def add_text_to_store_with_retry(store, doc, source_id):
    """
    Add a document's text and metadata to the vector store with retry logic.

    Args:
        store: The vector store object.
        doc: The document to be added.
        source_id: Unique identifier for the source.
    """
    try:
        doc.metadata["source_id"] = str(source_id)
        store.add_texts([doc.page_content], metadatas=[doc.metadata])
    except Exception as e:
        logging.error(f"Failed to add document with retry: {e}")
        raise


def embed_and_store_documents(docs, folder_name, source_id, task_status):
    """
    Embeds documents and stores them in a vector store.

    Args:
        docs (list): List of documents to be embedded and stored.
        folder_name (str): Directory to save the vector store.
        source_id (str): Unique identifier for the source.
        task_status: Task state manager for progress updates.

    Returns:
        None
    """
    # Ensure the folder exists
    if not os.path.exists(folder_name):
        os.makedirs(folder_name)

    # Initialize vector store
    if settings.VECTOR_STORE == "faiss":
        docs_init = [docs.pop(0)]
        store = VectorCreator.create_vectorstore(
            settings.VECTOR_STORE,
            docs_init=docs_init,
            source_id=folder_name,
            embeddings_key=os.getenv("EMBEDDINGS_KEY"),
        )
    else:
        store = VectorCreator.create_vectorstore(
            settings.VECTOR_STORE,
            source_id=source_id,
            embeddings_key=os.getenv("EMBEDDINGS_KEY"),
        )
        store.delete_index()

    total_docs = len(docs)

    # Process and embed documents
    for idx, doc in tqdm(
        enumerate(docs),
        desc="Embedding 🦖",
        unit="docs",
        total=total_docs,
        bar_format="{l_bar}{bar}| Time Left: {remaining}",
    ):
        try:
            # Update task status for progress tracking
            progress = int(((idx + 1) / total_docs) * 100)
            task_status.update_state(state="PROGRESS", meta={"current": progress})

            # Add document to vector store
            add_text_to_store_with_retry(store, doc, source_id)
        except Exception as e:
            logging.error(f"Error embedding document {idx}: {e}")
            logging.info(f"Saving progress at document {idx} out of {total_docs}")
            store.save_local(folder_name)
            break

    # Save the vector store
    if settings.VECTOR_STORE == "faiss":
        store.save_local(folder_name)

    logging.info("Vector store saved successfully.")
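
A hedged sketch of driving the new pipeline outside Celery; StubStatus is a hypothetical stand-in for the Celery task object (only update_state is needed), and it assumes settings.VECTOR_STORE and the EMBEDDINGS_KEY environment variable are configured:

from langchain_core.documents import Document
from application.parser.embedding_pipeline import embed_and_store_documents

class StubStatus:
    # Minimal stand-in for the Celery task's state interface.
    def update_state(self, state, meta):
        print(state, meta)

docs = [Document(page_content="hello world", metadata={"title": "demo"})]
embed_and_store_documents(docs, "indexes/demo", "demo-source-id", StubStatus())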

application/parser/open_ai_func.py (deleted)

@@ -1,75 +0,0 @@
import os

from retry import retry

from application.core.settings import settings
from application.vectorstore.vector_creator import VectorCreator

# from langchain_community.embeddings import HuggingFaceEmbeddings
# from langchain_community.embeddings import HuggingFaceInstructEmbeddings
# from langchain_community.embeddings import CohereEmbeddings


@retry(tries=10, delay=60)
def store_add_texts_with_retry(store, i, id):
    # add source_id to the metadata
    i.metadata["source_id"] = str(id)
    store.add_texts([i.page_content], metadatas=[i.metadata])
    # store_pine.add_texts([i.page_content], metadatas=[i.metadata])


def call_openai_api(docs, folder_name, id, task_status):
    # Function to create a vector store from the documents and save it to disk
    if not os.path.exists(f"{folder_name}"):
        os.makedirs(f"{folder_name}")

    from tqdm import tqdm

    c1 = 0
    if settings.VECTOR_STORE == "faiss":
        docs_init = [docs[0]]
        docs.pop(0)

        store = VectorCreator.create_vectorstore(
            settings.VECTOR_STORE,
            docs_init=docs_init,
            source_id=f"{folder_name}",
            embeddings_key=os.getenv("EMBEDDINGS_KEY"),
        )
    else:
        store = VectorCreator.create_vectorstore(
            settings.VECTOR_STORE,
            source_id=str(id),
            embeddings_key=os.getenv("EMBEDDINGS_KEY"),
        )
        store.delete_index()

    # Uncomment for MPNet embeddings
    # model_name = "sentence-transformers/all-mpnet-base-v2"
    # hf = HuggingFaceEmbeddings(model_name=model_name)
    # store = FAISS.from_documents(docs_test, hf)

    s1 = len(docs)
    for i in tqdm(
        docs,
        desc="Embedding 🦖",
        unit="docs",
        total=len(docs),
        bar_format="{l_bar}{bar}| Time Left: {remaining}",
    ):
        try:
            task_status.update_state(
                state="PROGRESS", meta={"current": int((c1 / s1) * 100)}
            )
            store_add_texts_with_retry(store, i, id)
        except Exception as e:
            print(e)
            print("Error on ", i)
            print("Saving progress")
            print(f"stopped at {c1} out of {len(docs)}")
            store.save_local(f"{folder_name}")
            break
        c1 += 1

    if settings.VECTOR_STORE == "faiss":
        store.save_local(f"{folder_name}")

application/parser/token_func.py (deleted)

@@ -1,79 +0,0 @@
import re
from math import ceil
from typing import List

import tiktoken

from application.parser.schema.base import Document


def separate_header_and_body(text):
    header_pattern = r"^(.*?\n){3}"
    match = re.match(header_pattern, text)
    header = match.group(0)
    body = text[len(header):]
    return header, body


def group_documents(documents: List[Document], min_tokens: int, max_tokens: int) -> List[Document]:
    docs = []
    current_group = None

    for doc in documents:
        doc_len = len(tiktoken.get_encoding("cl100k_base").encode(doc.text))

        # Check if current group is empty or if the document can be added based on token count and matching metadata
        if (current_group is None or
                (len(tiktoken.get_encoding("cl100k_base").encode(current_group.text)) + doc_len < max_tokens and
                 doc_len < min_tokens and
                 current_group.extra_info == doc.extra_info)):
            if current_group is None:
                current_group = doc  # Use the document directly to retain its metadata
            else:
                current_group.text += " " + doc.text  # Append text to the current group
        else:
            docs.append(current_group)
            current_group = doc  # Start a new group with the current document

    if current_group is not None:
        docs.append(current_group)

    return docs


def split_documents(documents: List[Document], max_tokens: int) -> List[Document]:
    docs = []
    for doc in documents:
        token_length = len(tiktoken.get_encoding("cl100k_base").encode(doc.text))
        if token_length <= max_tokens:
            docs.append(doc)
        else:
            header, body = separate_header_and_body(doc.text)
            if len(tiktoken.get_encoding("cl100k_base").encode(header)) > max_tokens:
                body = doc.text
                header = ""
            num_body_parts = ceil(token_length / max_tokens)
            part_length = ceil(len(body) / num_body_parts)
            body_parts = [body[i:i + part_length] for i in range(0, len(body), part_length)]
            for i, body_part in enumerate(body_parts):
                new_doc = Document(text=header + body_part.strip(),
                                   doc_id=f"{doc.doc_id}-{i}",
                                   embedding=doc.embedding,
                                   extra_info=doc.extra_info)
                docs.append(new_doc)
    return docs


def group_split(documents: List[Document], max_tokens: int = 2000, min_tokens: int = 150, token_check: bool = True):
    if not token_check:
        return documents
    print("Grouping small documents")
    try:
        documents = group_documents(documents=documents, min_tokens=min_tokens, max_tokens=max_tokens)
    except Exception:
        print("Grouping failed, try running without token_check")
    print("Separating large documents")
    try:
        documents = split_documents(documents=documents, max_tokens=max_tokens)
    except Exception:
        print("Grouping failed, try running without token_check")
    return documents
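
Worth noting when comparing with the new chunking.py above: split_documents here divided the body by character count (part_length over the raw string), so chunk sizes only approximated the token budget, and every call re-instantiated the cl100k_base encoding. Chunker.split_document instead slices the encoded token list directly, reuses one encoding created in __init__, and records an exact token_count per chunk.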

application/requirements.txt

@@ -1,24 +1,24 @@
anthropic==0.40.0
boto3==1.34.153
beautifulsoup4==4.12.3
celery==5.3.6
celery==5.4.0
dataclasses-json==0.6.7
docx2txt==0.8
duckduckgo-search==6.3.0
ebooklib==0.18
elastic-transport==8.15.0
elasticsearch==8.15.1
elastic-transport==8.15.1
elasticsearch==8.17.0
escodegen==1.0.11
esprima==4.0.1
esutils==1.0.1
Flask==3.0.3
faiss-cpu==1.8.0.post1
faiss-cpu==1.9.0.post1
flask-restx==1.3.0
gTTS==2.3.2
gunicorn==23.0.0
html2text==2024.2.26
javalang==0.13.0
jinja2==3.1.4
jinja2==3.1.5
jiter==0.5.0
jmespath==1.0.1
joblib==1.4.2
@@ -28,22 +28,22 @@ jsonschema==4.23.0
jsonschema-spec==0.2.4
jsonschema-specifications==2023.7.1
kombu==5.4.2
langchain==0.3.11
langchain-community==0.3.11
langchain-core==0.3.25
langchain-openai==0.2.0
langchain-text-splitters==0.3.0
langsmith==0.2.3
langchain==0.3.13
langchain-community==0.3.13
langchain-core==0.3.28
langchain-openai==0.2.14
langchain-text-splitters==0.3.4
langsmith==0.2.6
lazy-object-proxy==1.10.0
lxml==5.3.0
markupsafe==2.1.5
marshmallow==3.22.0
marshmallow==3.23.2
mpmath==1.3.0
multidict==6.1.0
mypy-extensions==1.0.0
networkx==3.3
numpy==1.26.4
openai==1.57.0
numpy==2.2.1
openai==1.58.1
openapi-schema-validator==0.6.2
openapi-spec-validator==0.6.0
openapi3-parser==1.1.18
@@ -68,13 +68,13 @@ python-dateutil==2.9.0.post0
python-dotenv==1.0.1
python-pptx==1.0.2
qdrant-client==1.11.0
redis==5.0.1
redis==5.2.1
referencing==0.30.2
regex==2024.9.11
requests==2.32.3
retry==0.9.2
sentence-transformers==3.3.1
tiktoken==0.7.0
tiktoken==0.8.0
tokenizers==0.21.0
torch==2.4.1
tqdm==4.66.5
@@ -86,4 +86,4 @@ urllib3==2.2.3
vine==5.1.0
wcwidth==0.2.13
werkzeug==3.1.3
yarl==1.11.1
yarl==1.18.3

application/worker.py

@@ -12,10 +12,10 @@ from bson.objectid import ObjectId
from application.core.mongo_db import MongoDB
from application.core.settings import settings
from application.parser.file.bulk import SimpleDirectoryReader
from application.parser.open_ai_func import call_openai_api
from application.parser.embedding_pipeline import embed_and_store_documents
from application.parser.remote.remote_creator import RemoteCreator
from application.parser.schema.base import Document
from application.parser.token_func import group_split
from application.parser.chunking import Chunker
from application.utils import count_tokens_docs
mongo = MongoDB.get_client()
@@ -126,7 +126,6 @@ def ingest_worker(
limit = None
exclude = True
sample = False
token_check = True
full_path = os.path.join(directory, user, name_job)
logging.info(f"Ingest file: {full_path}", extra={"user": user, "job": name_job})
@@ -153,17 +152,19 @@ def ingest_worker(
exclude_hidden=exclude,
file_metadata=metadata_from_filename,
).load_data()
raw_docs = group_split(
    documents=raw_docs,
    min_tokens=MIN_TOKENS,
    max_tokens=MAX_TOKENS,
    token_check=token_check,
)
chunker = Chunker(
    chunking_strategy="classic_chunk",
    max_tokens=MAX_TOKENS,
    min_tokens=MIN_TOKENS,
    duplicate_headers=False
)
raw_docs = chunker.chunk(documents=raw_docs)
docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs]
id = ObjectId()
call_openai_api(docs, full_path, id, self)
embed_and_store_documents(docs, full_path, id, self)
tokens = count_tokens_docs(docs)
self.update_state(state="PROGRESS", meta={"current": 100})
@@ -203,7 +204,6 @@ def remote_worker(
operation_mode="upload",
doc_id=None,
):
token_check = True
full_path = os.path.join(directory, user, name_job)
if not os.path.exists(full_path):
@@ -217,21 +217,23 @@ def remote_worker(
remote_loader = RemoteCreator.create_loader(loader)
raw_docs = remote_loader.load_data(source_data)
docs = group_split(
    documents=raw_docs,
    min_tokens=MIN_TOKENS,
    max_tokens=MAX_TOKENS,
    token_check=token_check,
)
chunker = Chunker(
    chunking_strategy="classic_chunk",
    max_tokens=MAX_TOKENS,
    min_tokens=MIN_TOKENS,
    duplicate_headers=False
)
docs = chunker.chunk(documents=raw_docs)
tokens = count_tokens_docs(docs)
if operation_mode == "upload":
id = ObjectId()
call_openai_api(docs, full_path, id, self)
embed_and_store_documents(docs, full_path, id, self)
elif operation_mode == "sync":
if not doc_id or not ObjectId.is_valid(doc_id):
raise ValueError("doc_id must be provided for sync operation.")
id = ObjectId(doc_id)
call_openai_api(docs, full_path, id, self)
embed_and_store_documents(docs, full_path, id, self)
self.update_state(state="PROGRESS", meta={"current": 100})
file_data = {

frontend/package-lock.json

@@ -10,7 +10,7 @@
"dependencies": {
"@reduxjs/toolkit": "^2.2.7",
"chart.js": "^4.4.4",
"i18next": "^23.15.1",
"i18next": "^24.2.0",
"i18next-browser-languagedetector": "^8.0.0",
"prop-types": "^15.8.1",
"react": "^18.2.0",
@@ -22,7 +22,7 @@
"react-i18next": "^15.0.2",
"react-markdown": "^9.0.1",
"react-redux": "^8.0.5",
"react-router-dom": "^6.8.1",
"react-router-dom": "^7.1.1",
"react-syntax-highlighter": "^15.5.0",
"rehype-katex": "^7.0.1",
"remark-gfm": "^4.0.0",
@@ -47,12 +47,12 @@
"eslint-plugin-react": "^7.37.2",
"eslint-plugin-unused-imports": "^4.1.4",
"husky": "^8.0.0",
"lint-staged": "^15.2.10",
"postcss": "^8.4.41",
"lint-staged": "^15.2.11",
"postcss": "^8.4.49",
"prettier": "^3.3.3",
"prettier-plugin-tailwindcss": "^0.6.8",
"tailwindcss": "^3.4.15",
"typescript": "^5.6.2",
"typescript": "^5.7.2",
"vite": "^5.4.11",
"vite-plugin-svgr": "^4.2.0"
}
@@ -864,18 +864,6 @@
"url": "https://github.com/chalk/ansi-regex?sponsor=1"
}
},
"node_modules/@isaacs/cliui/node_modules/ansi-styles": {
"version": "6.2.1",
"resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-6.2.1.tgz",
"integrity": "sha512-bN798gFfQX+viw3R7yrGWRqnrN2oRkEkUjjl4JNn4E8GxxbjtG3FbrEIIY3l8/hrwUwIeCZvi4QuOTP4MErVug==",
"dev": true,
"engines": {
"node": ">=12"
},
"funding": {
"url": "https://github.com/chalk/ansi-styles?sponsor=1"
}
},
"node_modules/@isaacs/cliui/node_modules/emoji-regex": {
"version": "9.2.2",
"resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-9.2.2.tgz",
@@ -1063,14 +1051,6 @@
}
}
},
"node_modules/@remix-run/router": {
"version": "1.3.2",
"resolved": "https://registry.npmjs.org/@remix-run/router/-/router-1.3.2.tgz",
"integrity": "sha512-t54ONhl/h75X94SWsHGQ4G/ZrCEguKSRQr7DrjTciJXW0YU1QhlwYeycvK5JgkzlxmvrK7wq1NB/PLtHxoiDcA==",
"engines": {
"node": ">=14"
}
},
"node_modules/@rollup/pluginutils": {
"version": "5.1.0",
"resolved": "https://registry.npmjs.org/@rollup/pluginutils/-/pluginutils-5.1.0.tgz",
@@ -1561,6 +1541,11 @@
"@babel/types": "^7.20.7"
}
},
"node_modules/@types/cookie": {
"version": "0.6.0",
"resolved": "https://registry.npmjs.org/@types/cookie/-/cookie-0.6.0.tgz",
"integrity": "sha512-4Kh9a6B2bQciAhf7FSuMRRkUWecJgJu9nPnx3yzpsfXX/c50REIqpHY4C82bXP90qrLtXtkDxTZosYO3UpOwlA=="
},
"node_modules/@types/debug": {
"version": "4.1.12",
"resolved": "https://registry.npmjs.org/@types/debug/-/debug-4.1.12.tgz",
@@ -1649,7 +1634,7 @@
"version": "18.3.0",
"resolved": "https://registry.npmjs.org/@types/react-dom/-/react-dom-18.3.0.tgz",
"integrity": "sha512-EhwApuTmMBmXuFOikhQLIBUn6uFg81SwLMOAUgodJF14SOBOCMdU04gDoYi0WOJJHD144TL32z4yDqCW3dnkQg==",
"dev": true,
"devOptional": true,
"dependencies": {
"@types/react": "*"
}
@@ -2152,6 +2137,18 @@
"node": ">=8"
}
},
"node_modules/ansi-styles": {
"version": "6.2.1",
"resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-6.2.1.tgz",
"integrity": "sha512-bN798gFfQX+viw3R7yrGWRqnrN2oRkEkUjjl4JNn4E8GxxbjtG3FbrEIIY3l8/hrwUwIeCZvi4QuOTP4MErVug==",
"dev": true,
"engines": {
"node": ">=12"
},
"funding": {
"url": "https://github.com/chalk/ansi-styles?sponsor=1"
}
},
"node_modules/any-promise": {
"version": "1.3.0",
"resolved": "https://registry.npmjs.org/any-promise/-/any-promise-1.3.0.tgz",
@@ -2784,6 +2781,14 @@
"integrity": "sha512-Kvp459HrV2FEJ1CAsi1Ku+MY3kasH19TFykTz2xWmMeq6bk2NU3XXvfJ+Q61m0xktWwt+1HSYf3JZsTms3aRJg==",
"dev": true
},
"node_modules/cookie": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/cookie/-/cookie-1.0.2.tgz",
"integrity": "sha512-9Kr/j4O16ISv8zBBhJoi4bXOYNTkFLOqSL3UDB0njXxCXNezjeyVrJyGOWtgfs/q2km1gwBcfH8q1yEGoMYunA==",
"engines": {
"node": ">=18"
}
},
"node_modules/copy-to-clipboard": {
"version": "3.3.3",
"resolved": "https://registry.npmjs.org/copy-to-clipboard/-/copy-to-clipboard-3.3.3.tgz",
@@ -2902,11 +2907,11 @@
}
},
"node_modules/debug": {
"version": "4.3.6",
"resolved": "https://registry.npmjs.org/debug/-/debug-4.3.6.tgz",
"integrity": "sha512-O/09Bd4Z1fBrU4VzkhFqVgpPzaGbw6Sm9FEkBT1A/YBXQFGuuSxa1dN2nxgxS34JmKXqYx8CZAwEVoJFImUXIg==",
"version": "4.4.0",
"resolved": "https://registry.npmjs.org/debug/-/debug-4.4.0.tgz",
"integrity": "sha512-6WTZ/IxCY/T6BALoZHaE4ctp9xm+Z5kY/pzYaCHRFeyVhojxlrm+46y68HA6hr0TcwEssoxNiDEUJQjfPZ/RYA==",
"dependencies": {
"ms": "2.1.2"
"ms": "^2.1.3"
},
"engines": {
"node": ">=6.0"
@@ -3068,9 +3073,9 @@
"dev": true
},
"node_modules/emoji-regex": {
"version": "10.3.0",
"resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-10.3.0.tgz",
"integrity": "sha512-QpLs9D9v9kArv4lfDEgg1X/gN5XLnf/A6l9cs8SPZLRZR3ZkY9+kwIQTxm+fsSej5UMYGE8fdoaZVIBlqG0XTw==",
"version": "10.4.0",
"resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-10.4.0.tgz",
"integrity": "sha512-EC+0oUMY1Rqm4O6LLrgjtYDvcVYTy7chDnM4Q7030tP4Kwj3u/pR6gP9ygnp2CJMK5Gq+9Q2oqmrFJAz01DXjw==",
"dev": true
},
"node_modules/entities": {
@@ -4304,9 +4309,9 @@
}
},
"node_modules/get-east-asian-width": {
"version": "1.2.0",
"resolved": "https://registry.npmjs.org/get-east-asian-width/-/get-east-asian-width-1.2.0.tgz",
"integrity": "sha512-2nk+7SIVb14QrgXFHcm84tD4bKQz0RxPuMT8Ag5KPOq7J5fEmAg0UbXdTOSHqNuHSU28k55qnceesxXRZGzKWA==",
"version": "1.3.0",
"resolved": "https://registry.npmjs.org/get-east-asian-width/-/get-east-asian-width-1.3.0.tgz",
"integrity": "sha512-vpeMIQKxczTD/0s2CdEWHcb0eeJe6TFjxb+J5xgX7hScxqrGuyjmv4c1D4A/gelKfyox0gJJwIHF+fLjeaM8kQ==",
"dev": true,
"engines": {
"node": ">=18"
@@ -4921,9 +4926,9 @@
}
},
"node_modules/i18next": {
"version": "23.15.1",
"resolved": "https://registry.npmjs.org/i18next/-/i18next-23.15.1.tgz",
"integrity": "sha512-wB4abZ3uK7EWodYisHl/asf8UYEhrI/vj/8aoSsrj/ZDxj4/UXPOa1KvFt1Fq5hkUHquNqwFlDprmjZ8iySgYA==",
"version": "24.2.0",
"resolved": "https://registry.npmjs.org/i18next/-/i18next-24.2.0.tgz",
"integrity": "sha512-ArJJTS1lV6lgKH7yEf4EpgNZ7+THl7bsGxxougPYiXRTJ/Fe1j08/TBpV9QsXCIYVfdE/HWG/xLezJ5DOlfBOA==",
"funding": [
{
"type": "individual",
@@ -4940,6 +4945,14 @@
],
"dependencies": {
"@babel/runtime": "^7.23.2"
},
"peerDependencies": {
"typescript": "^5"
},
"peerDependenciesMeta": {
"typescript": {
"optional": true
}
}
},
"node_modules/i18next-browser-languagedetector": {
@@ -5663,21 +5676,21 @@
"dev": true
},
"node_modules/lint-staged": {
"version": "15.2.10",
"resolved": "https://registry.npmjs.org/lint-staged/-/lint-staged-15.2.10.tgz",
"integrity": "sha512-5dY5t743e1byO19P9I4b3x8HJwalIznL5E1FWYnU6OWw33KxNBSLAc6Cy7F2PsFEO8FKnLwjwm5hx7aMF0jzZg==",
"version": "15.2.11",
"resolved": "https://registry.npmjs.org/lint-staged/-/lint-staged-15.2.11.tgz",
"integrity": "sha512-Ev6ivCTYRTGs9ychvpVw35m/bcNDuBN+mnTeObCL5h+boS5WzBEC6LHI4I9F/++sZm1m+J2LEiy0gxL/R9TBqQ==",
"dev": true,
"dependencies": {
"chalk": "~5.3.0",
"commander": "~12.1.0",
"debug": "~4.3.6",
"debug": "~4.4.0",
"execa": "~8.0.1",
"lilconfig": "~3.1.2",
"listr2": "~8.2.4",
"lilconfig": "~3.1.3",
"listr2": "~8.2.5",
"micromatch": "~4.0.8",
"pidtree": "~0.6.0",
"string-argv": "~0.3.2",
"yaml": "~2.5.0"
"yaml": "~2.6.1"
},
"bin": {
"lint-staged": "bin/lint-staged.js"
@@ -5702,9 +5715,9 @@
}
},
"node_modules/lint-staged/node_modules/lilconfig": {
"version": "3.1.2",
"resolved": "https://registry.npmjs.org/lilconfig/-/lilconfig-3.1.2.tgz",
"integrity": "sha512-eop+wDAvpItUys0FWkHIKeC9ybYrTGbU41U5K7+bttZZeohvnY7M9dZ5kB21GNWiFT2q1OoPTvncPCgSOVO5ow==",
"version": "3.1.3",
"resolved": "https://registry.npmjs.org/lilconfig/-/lilconfig-3.1.3.tgz",
"integrity": "sha512-/vlFKAoH5Cgt3Ie+JLhRbwOsCQePABiU3tJ1egGvyQ+33R/vcwM2Zl2QR/LzjsBeItPt3oSVXapn+m4nQDvpzw==",
"dev": true,
"engines": {
"node": ">=14"
@@ -5714,9 +5727,9 @@
}
},
"node_modules/listr2": {
"version": "8.2.4",
"resolved": "https://registry.npmjs.org/listr2/-/listr2-8.2.4.tgz",
"integrity": "sha512-opevsywziHd3zHCVQGAj8zu+Z3yHNkkoYhWIGnq54RrCVwLz0MozotJEDnKsIBLvkfLGN6BLOyAeRrYI0pKA4g==",
"version": "8.2.5",
"resolved": "https://registry.npmjs.org/listr2/-/listr2-8.2.5.tgz",
"integrity": "sha512-iyAZCeyD+c1gPyE9qpFu8af0Y+MRtmKOncdGoA2S5EY8iFq99dmmvkNnHiWo+pj0s7yH7l3KPIgee77tKpXPWQ==",
"dev": true,
"dependencies": {
"cli-truncate": "^4.0.0",
@@ -5771,9 +5784,9 @@
}
},
"node_modules/log-update/node_modules/ansi-regex": {
"version": "6.0.1",
"resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-6.0.1.tgz",
"integrity": "sha512-n5M855fKb2SsfMIiFFoVrABHJC8QtHwVx+mHWP3QcEqBHYienj5dHSgjbxtC0WEZXYt4wcD6zrQElDPhFuZgfA==",
"version": "6.1.0",
"resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-6.1.0.tgz",
"integrity": "sha512-7HSX4QQb4CspciLpVFwyRe79O3xsIZDDLER21kERQ71oaPodF8jL725AgJMFAYbooIqolJoRLuM81SpeUkpkvA==",
"dev": true,
"engines": {
"node": ">=12"
@@ -5782,18 +5795,6 @@
"url": "https://github.com/chalk/ansi-regex?sponsor=1"
}
},
"node_modules/log-update/node_modules/ansi-styles": {
"version": "6.2.1",
"resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-6.2.1.tgz",
"integrity": "sha512-bN798gFfQX+viw3R7yrGWRqnrN2oRkEkUjjl4JNn4E8GxxbjtG3FbrEIIY3l8/hrwUwIeCZvi4QuOTP4MErVug==",
"dev": true,
"engines": {
"node": ">=12"
},
"funding": {
"url": "https://github.com/chalk/ansi-styles?sponsor=1"
}
},
"node_modules/log-update/node_modules/is-fullwidth-code-point": {
"version": "5.0.0",
"resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-5.0.0.tgz",
@@ -6960,9 +6961,9 @@
}
},
"node_modules/ms": {
"version": "2.1.2",
"resolved": "https://registry.npmjs.org/ms/-/ms-2.1.2.tgz",
"integrity": "sha512-sGkPx+VjMtmA6MX27oA4FBFELFCZZ4S4XqeGOXCv68tT+jb3vk/RyaKWP0PTKyWtmLSM0b+adUTEvbs1PEaH2w=="
"version": "2.1.3",
"resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz",
"integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA=="
},
"node_modules/mz": {
"version": "2.7.0",
@@ -7458,9 +7459,9 @@
}
},
"node_modules/postcss": {
"version": "8.4.47",
"resolved": "https://registry.npmjs.org/postcss/-/postcss-8.4.47.tgz",
"integrity": "sha512-56rxCq7G/XfB4EkXq9Egn5GCqugWvDFjafDOThIdMBsI15iqPqR5r15TfSr1YPYeEI19YeaXMCbY6u88Y76GLQ==",
"version": "8.4.49",
"resolved": "https://registry.npmjs.org/postcss/-/postcss-8.4.49.tgz",
"integrity": "sha512-OCVPnIObs4N29kxTjzLfUryOkvZEq+pf8jTF0lg8E7uETuWHA+v7j3c/xJmiqpX450191LlmZfUKkXxkTry7nA==",
"dev": true,
"funding": [
{
@@ -7478,7 +7479,7 @@
],
"dependencies": {
"nanoid": "^3.3.7",
"picocolors": "^1.1.0",
"picocolors": "^1.1.1",
"source-map-js": "^1.2.1"
},
"engines": {
@@ -7976,33 +7977,41 @@
}
},
"node_modules/react-router": {
"version": "6.8.1",
"resolved": "https://registry.npmjs.org/react-router/-/react-router-6.8.1.tgz",
"integrity": "sha512-Jgi8BzAJQ8MkPt8ipXnR73rnD7EmZ0HFFb7jdQU24TynGW1Ooqin2KVDN9voSC+7xhqbbCd2cjGUepb6RObnyg==",
"version": "7.1.1",
"resolved": "https://registry.npmjs.org/react-router/-/react-router-7.1.1.tgz",
"integrity": "sha512-39sXJkftkKWRZ2oJtHhCxmoCrBCULr/HAH4IT5DHlgu/Q0FCPV0S4Lx+abjDTx/74xoZzNYDYbOZWlJjruyuDQ==",
"dependencies": {
"@remix-run/router": "1.3.2"
"@types/cookie": "^0.6.0",
"cookie": "^1.0.1",
"set-cookie-parser": "^2.6.0",
"turbo-stream": "2.4.0"
},
"engines": {
"node": ">=14"
"node": ">=20.0.0"
},
"peerDependencies": {
"react": ">=16.8"
"react": ">=18",
"react-dom": ">=18"
},
"peerDependenciesMeta": {
"react-dom": {
"optional": true
}
}
},
"node_modules/react-router-dom": {
"version": "6.8.1",
"resolved": "https://registry.npmjs.org/react-router-dom/-/react-router-dom-6.8.1.tgz",
"integrity": "sha512-67EXNfkQgf34P7+PSb6VlBuaacGhkKn3kpE51+P6zYSG2kiRoumXEL6e27zTa9+PGF2MNXbgIUHTVlleLbIcHQ==",
"version": "7.1.1",
"resolved": "https://registry.npmjs.org/react-router-dom/-/react-router-dom-7.1.1.tgz",
"integrity": "sha512-vSrQHWlJ5DCfyrhgo0k6zViOe9ToK8uT5XGSmnuC2R3/g261IdIMpZVqfjD6vWSXdnf5Czs4VA/V60oVR6/jnA==",
"dependencies": {
"@remix-run/router": "1.3.2",
"react-router": "6.8.1"
"react-router": "7.1.1"
},
"engines": {
"node": ">=14"
"node": ">=20.0.0"
},
"peerDependencies": {
"react": ">=16.8",
"react-dom": ">=16.8"
"react": ">=18",
"react-dom": ">=18"
}
},
"node_modules/react-side-effect": {
@@ -8460,6 +8469,11 @@
"semver": "bin/semver.js"
}
},
"node_modules/set-cookie-parser": {
"version": "2.7.1",
"resolved": "https://registry.npmjs.org/set-cookie-parser/-/set-cookie-parser-2.7.1.tgz",
"integrity": "sha512-IOc8uWeOZgnb3ptbCURJWNjWUPcO3ZnTTdzsurqERrP6nPyv+paC55vJM0LpOlT2ne+Ix+9+CRG1MNLlyZ4GjQ=="
},
"node_modules/set-function-length": {
"version": "1.2.2",
"resolved": "https://registry.npmjs.org/set-function-length/-/set-function-length-1.2.2.tgz",
@@ -8568,18 +8582,6 @@
"url": "https://github.com/chalk/slice-ansi?sponsor=1"
}
},
"node_modules/slice-ansi/node_modules/ansi-styles": {
"version": "6.2.1",
"resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-6.2.1.tgz",
"integrity": "sha512-bN798gFfQX+viw3R7yrGWRqnrN2oRkEkUjjl4JNn4E8GxxbjtG3FbrEIIY3l8/hrwUwIeCZvi4QuOTP4MErVug==",
"dev": true,
"engines": {
"node": ">=12"
},
"funding": {
"url": "https://github.com/chalk/ansi-styles?sponsor=1"
}
},
"node_modules/snake-case": {
"version": "3.0.4",
"resolved": "https://registry.npmjs.org/snake-case/-/snake-case-3.0.4.tgz",
@@ -8671,9 +8673,9 @@
}
},
"node_modules/string-width/node_modules/ansi-regex": {
"version": "6.0.1",
"resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-6.0.1.tgz",
"integrity": "sha512-n5M855fKb2SsfMIiFFoVrABHJC8QtHwVx+mHWP3QcEqBHYienj5dHSgjbxtC0WEZXYt4wcD6zrQElDPhFuZgfA==",
"version": "6.1.0",
"resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-6.1.0.tgz",
"integrity": "sha512-7HSX4QQb4CspciLpVFwyRe79O3xsIZDDLER21kERQ71oaPodF8jL725AgJMFAYbooIqolJoRLuM81SpeUkpkvA==",
"dev": true,
"engines": {
"node": ">=12"
@@ -9149,6 +9151,11 @@
"typescript": ">=2.8.0 || >= 3.2.0-dev || >= 3.3.0-dev || >= 3.4.0-dev || >= 3.5.0-dev || >= 3.6.0-dev || >= 3.6.0-beta || >= 3.7.0-dev || >= 3.7.0-beta"
}
},
"node_modules/turbo-stream": {
"version": "2.4.0",
"resolved": "https://registry.npmjs.org/turbo-stream/-/turbo-stream-2.4.0.tgz",
"integrity": "sha512-FHncC10WpBd2eOmGwpmQsWLDoK4cqsA/UT/GqNoaKOQnT8uzhtCbg3EoUDMvqpOSAI0S26mr0rkjzbOO6S3v1g=="
},
"node_modules/type-check": {
"version": "0.4.0",
"resolved": "https://registry.npmjs.org/type-check/-/type-check-0.4.0.tgz",
@@ -9247,10 +9254,10 @@
}
},
"node_modules/typescript": {
"version": "5.6.2",
"resolved": "https://registry.npmjs.org/typescript/-/typescript-5.6.2.tgz",
"integrity": "sha512-NW8ByodCSNCwZeghjN3o+JX5OFH0Ojg6sadjEKY4huZ52TqbJTJnDo5+Tw98lSy63NZvi4n+ez5m2u5d4PkZyw==",
"dev": true,
"version": "5.7.2",
"resolved": "https://registry.npmjs.org/typescript/-/typescript-5.7.2.tgz",
"integrity": "sha512-i5t66RHxDvVN40HfDd1PsEThGNnlMCMT3jMUuoh9/0TaqWevNontacunWyN02LA9/fIbEWlcHZcgTKb9QoaLfg==",
"devOptional": true,
"bin": {
"tsc": "bin/tsc",
"tsserver": "bin/tsserver"
@@ -9767,9 +9774,9 @@
}
},
"node_modules/wrap-ansi/node_modules/ansi-regex": {
"version": "6.0.1",
"resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-6.0.1.tgz",
"integrity": "sha512-n5M855fKb2SsfMIiFFoVrABHJC8QtHwVx+mHWP3QcEqBHYienj5dHSgjbxtC0WEZXYt4wcD6zrQElDPhFuZgfA==",
"version": "6.1.0",
"resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-6.1.0.tgz",
"integrity": "sha512-7HSX4QQb4CspciLpVFwyRe79O3xsIZDDLER21kERQ71oaPodF8jL725AgJMFAYbooIqolJoRLuM81SpeUkpkvA==",
"dev": true,
"engines": {
"node": ">=12"
@@ -9778,18 +9785,6 @@
"url": "https://github.com/chalk/ansi-regex?sponsor=1"
}
},
"node_modules/wrap-ansi/node_modules/ansi-styles": {
"version": "6.2.1",
"resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-6.2.1.tgz",
"integrity": "sha512-bN798gFfQX+viw3R7yrGWRqnrN2oRkEkUjjl4JNn4E8GxxbjtG3FbrEIIY3l8/hrwUwIeCZvi4QuOTP4MErVug==",
"dev": true,
"engines": {
"node": ">=12"
},
"funding": {
"url": "https://github.com/chalk/ansi-styles?sponsor=1"
}
},
"node_modules/wrap-ansi/node_modules/strip-ansi": {
"version": "7.1.0",
"resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-7.1.0.tgz",
@@ -9826,9 +9821,9 @@
"dev": true
},
"node_modules/yaml": {
"version": "2.5.1",
"resolved": "https://registry.npmjs.org/yaml/-/yaml-2.5.1.tgz",
"integrity": "sha512-bLQOjaX/ADgQ20isPJRvF0iRUHIxVhYvr53Of7wGcWlO2jvtUlH5m87DsmulFVxRpNLOnI4tB6p/oh8D7kpn9Q==",
"version": "2.6.1",
"resolved": "https://registry.npmjs.org/yaml/-/yaml-2.6.1.tgz",
"integrity": "sha512-7r0XPzioN/Q9kXBro/XPnA6kznR73DHq+GXh5ON7ZozRO6aMjbmiBuKste2wslTFkC5d1dw0GooOCepZXJ2SAg==",
"dev": true,
"bin": {
"yaml": "bin.mjs"

frontend/package.json

@@ -21,7 +21,7 @@
"dependencies": {
"@reduxjs/toolkit": "^2.2.7",
"chart.js": "^4.4.4",
"i18next": "^23.15.1",
"i18next": "^24.2.0",
"i18next-browser-languagedetector": "^8.0.0",
"prop-types": "^15.8.1",
"react": "^18.2.0",
@@ -33,7 +33,7 @@
"react-i18next": "^15.0.2",
"react-markdown": "^9.0.1",
"react-redux": "^8.0.5",
"react-router-dom": "^6.8.1",
"react-router-dom": "^7.1.1",
"react-syntax-highlighter": "^15.5.0",
"rehype-katex": "^7.0.1",
"remark-gfm": "^4.0.0",
@@ -58,12 +58,12 @@
"eslint-plugin-react": "^7.37.2",
"eslint-plugin-unused-imports": "^4.1.4",
"husky": "^8.0.0",
"lint-staged": "^15.2.10",
"postcss": "^8.4.41",
"lint-staged": "^15.2.11",
"postcss": "^8.4.49",
"prettier": "^3.3.3",
"prettier-plugin-tailwindcss": "^0.6.8",
"tailwindcss": "^3.4.15",
"typescript": "^5.6.2",
"typescript": "^5.7.2",
"vite": "^5.4.11",
"vite-plugin-svgr": "^4.2.0"
}

scripts/requirements.txt (deleted)

@@ -1,22 +0,0 @@
dataclasses_json==0.6.3
docx2txt==0.8
EbookLib==0.18
escodegen==1.0.11
esprima==4.0.1
faiss_cpu==1.7.4
html2text==2020.1.16
javalang==0.13.0
langchain==0.2.10
langchain_community==0.2.9
langchain-openai==0.0.5
nltk==3.9
openapi3_parser==1.1.16
pandas==2.2.0
PyPDF2==3.0.1
python-dotenv==1.0.1
retry==0.9.2
Sphinx==7.2.6
tiktoken==0.5.2
tqdm==4.66.3
typer==0.9.0
unstructured==0.12.2