* validate Python formatting on every build with Ruff (a sketch of such a check follows the file summary below)
* fix lint warnings
Author: Anton Larin
Date: 2023-05-13 10:36:17 +02:00
Committed by: GitHub
Parent: 168648e789
Commit: 962becb9a5
35 changed files with 277 additions and 252 deletions
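
The commit message promises that Python formatting is validated on every build with Ruff, but the CI configuration itself is not among the hunks shown below. The following is only a minimal sketch of what such a build gate could look like, assuming Ruff is installed; the lint targets `application` and `scripts` are hypothetical stand-ins, and a real pipeline would more likely call `ruff check .` directly.

import subprocess
import sys


def run_ruff(paths: list[str]) -> int:
    """Run `ruff check` on the given paths and return Ruff's exit code."""
    completed = subprocess.run(["ruff", "check", *paths], check=False)
    return completed.returncode


if __name__ == "__main__":
    # Hypothetical lint targets; a non-zero exit code fails the surrounding build.
    sys.exit(run_ruff(["application", "scripts"]))

Any reported violation makes `ruff check` exit non-zero, which is what turns the lint step into a build failure.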


@@ -3,7 +3,6 @@ from abc import abstractmethod
from typing import Any, List
from langchain.docstore.document import Document as LCDocument
from parser.schema.base import Document


@@ -24,12 +24,11 @@ class HTMLParser(BaseParser):
Union[str, List[str]]: a string or a List of strings.
"""
try:
-    import unstructured
+    from unstructured.partition.html import partition_html
+    from unstructured.staging.base import convert_to_isd
+    from unstructured.cleaners.core import clean
except ImportError:
raise ValueError("unstructured package is required to parse HTML files.")
-from unstructured.partition.html import partition_html
-from unstructured.staging.base import convert_to_isd
-from unstructured.cleaners.core import clean
# Using the unstructured library to convert the html to isd format
# isd sample : isd = [
@@ -70,7 +69,8 @@ class HTMLParser(BaseParser):
Chunks.append([])
Chunks[-1].append(isd_el['text'])
-# Removing all the chunks with sum of lenth of all the strings in the chunk < 25 #TODO: This value can be an user defined variable
+# Removing all the chunks with sum of lenth of all the strings in the chunk < 25
+# TODO: This value can be a user defined variable
for chunk in Chunks:
# sum of lenth of all the strings in the chunk
sum = 0


@@ -7,8 +7,8 @@ import re
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union, cast
-from parser.file.base_parser import BaseParser
import tiktoken
+from parser.file.base_parser import BaseParser
class MarkdownParser(BaseParser):


@@ -5,10 +5,9 @@ Contains parser for md files.
"""
import re
from pathlib import Path
-from typing import Any, Dict, List, Optional, Tuple, Union, cast
+from typing import Any, Dict, List, Optional, Tuple, Union
from parser.file.base_parser import BaseParser
import tiktoken
class RstParser(BaseParser):


@@ -1,4 +1,5 @@
import os
import javalang


@@ -1,6 +1,7 @@
import os
-import esprima
import escodegen
+import esprima
def find_files(directory):
@@ -27,7 +28,6 @@ def extract_functions(file_path):
func_name = declaration.id.name if declaration.id else '<anonymous>'
functions[func_name] = escodegen.generate(declaration.init)
elif node.type == 'ClassDeclaration':
-class_name = node.id.name
for subnode in node.body.body:
if subnode.type == 'MethodDefinition':
func_name = subnode.key.name


@@ -1,16 +1,15 @@
import os
-import faiss
import pickle
import tiktoken
-from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
+from langchain.vectorstores import FAISS
+from retry import retry
# from langchain.embeddings import HuggingFaceEmbeddings
# from langchain.embeddings import HuggingFaceInstructEmbeddings
# from langchain.embeddings import CohereEmbeddings
-from retry import retry
def num_tokens_from_string(string: str, encoding_name: str) -> int:
# Function to convert string to tokens and estimate user cost.


@@ -1,7 +1,8 @@
-import os
import ast
-import tiktoken
+import os
from pathlib import Path
+import tiktoken
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
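
Several hunks in this commit only reorder imports; the removed-and-re-added pairs reflect Ruff's isort-style sorting (rule I001), which groups standard-library, third-party, and first-party imports, places plain `import` statements ahead of `from` imports within each group, and alphabetizes them. A hedged illustration, assembled from imports that already appear in this diff:

# Illustrative layout only, built from imports seen in the hunks of this commit.
import ast                  # standard library: plain imports first, alphabetized
import os
from pathlib import Path    # then stdlib `from` imports

import tiktoken             # third-party packages
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate

from parser.file.base_parser import BaseParser  # first-party project code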


@@ -2,7 +2,6 @@
from dataclasses import dataclass
from langchain.docstore.document import Document as LCDocument
from parser.schema.schema import BaseDocument


@@ -1,9 +1,9 @@
import re
-import tiktoken
-from typing import List
-from parser.schema.base import Document
from math import ceil
+from typing import List
+import tiktoken
+from parser.schema.base import Document
def separate_header_and_body(text):
@@ -59,16 +59,16 @@ def split_documents(documents: List[Document], max_tokens: int) -> List[Document
def group_split(documents: List[Document], max_tokens: int = 2000, min_tokens: int = 150, token_check: bool = True):
-if token_check == False:
+if not token_check:
return documents
print("Grouping small documents")
try:
documents = group_documents(documents=documents, min_tokens=min_tokens, max_tokens=max_tokens)
-except:
+except Exception:
print("Grouping failed, try running without token_check")
print("Separating large documents")
try:
documents = split_documents(documents=documents, max_tokens=max_tokens)
-except:
+except Exception:
print("Grouping failed, try running without token_check")
return documents
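
The final hunk shows two fixes that Ruff flags via its pycodestyle rules: comparing a boolean to `False` (E712) is replaced by `not token_check`, and bare `except:` clauses (E722) become `except Exception:`, which still catches ordinary errors but lets `KeyboardInterrupt` and `SystemExit` propagate. A standalone sketch, not taken from the repository:

def call_safely(fn):
    # `except Exception` is narrower than a bare `except`, which would also
    # swallow KeyboardInterrupt and SystemExit (they derive from BaseException).
    try:
        return fn()
    except Exception as err:
        print(f"call failed: {err}")
        return None


token_check = False
if not token_check:  # preferred over `if token_check == False:` (E712)
    print("skipping token check")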