Fix packaging and imports, and introduce tests with pytest.

There are still issues with the Celery worker.
This commit is contained in:
Anton Larin
2023-08-13 19:25:55 +02:00
parent 9a393b4f74
commit 98a97f34f5
23 changed files with 107 additions and 37 deletions

View File

@@ -0,0 +1 @@

View File

@@ -3,7 +3,7 @@ from abc import abstractmethod
from typing import Any, List
from langchain.docstore.document import Document as LCDocument
from parser.schema.base import Document
from application.parser.schema.base import Document
class BaseReader:

View File

@@ -3,15 +3,15 @@ import logging
from pathlib import Path
from typing import Callable, Dict, List, Optional, Union
from parser.file.base import BaseReader
from parser.file.base_parser import BaseParser
from parser.file.docs_parser import DocxParser, PDFParser
from parser.file.epub_parser import EpubParser
from parser.file.html_parser import HTMLParser
from parser.file.markdown_parser import MarkdownParser
from parser.file.rst_parser import RstParser
from parser.file.tabular_parser import PandasCSVParser
from parser.schema.base import Document
from application.parser.file.base import BaseReader
from application.parser.file.base_parser import BaseParser
from application.parser.file.docs_parser import DocxParser, PDFParser
from application.parser.file.epub_parser import EpubParser
from application.parser.file.html_parser import HTMLParser
from application.parser.file.markdown_parser import MarkdownParser
from application.parser.file.rst_parser import RstParser
from application.parser.file.tabular_parser import PandasCSVParser
from application.parser.schema.base import Document
DEFAULT_FILE_EXTRACTOR: Dict[str, BaseParser] = {
".pdf": PDFParser(),

View File

@@ -6,7 +6,7 @@ Contains parsers for docx, pdf files.
from pathlib import Path
from typing import Dict
from parser.file.base_parser import BaseParser
from application.parser.file.base_parser import BaseParser
class PDFParser(BaseParser):

View File

@@ -6,7 +6,7 @@ Contains parsers for epub files.
from pathlib import Path
from typing import Dict
from parser.file.base_parser import BaseParser
from application.parser.file.base_parser import BaseParser
class EpubParser(BaseParser):

View File

@@ -7,7 +7,7 @@ import re
from pathlib import Path
from typing import Dict, Union
from parser.file.base_parser import BaseParser
from application.parser.file.base_parser import BaseParser
class HTMLParser(BaseParser):

View File

@@ -8,7 +8,7 @@ from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union, cast
import tiktoken
from parser.file.base_parser import BaseParser
from application.parser.file.base_parser import BaseParser
class MarkdownParser(BaseParser):

View File

@@ -7,7 +7,7 @@ import re
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union
from parser.file.base_parser import BaseParser
from application.parser.file.base_parser import BaseParser
class RstParser(BaseParser):

View File

@@ -6,7 +6,7 @@ Contains parsers for tabular data files.
from pathlib import Path
from typing import Any, Dict, List, Union
from parser.file.base_parser import BaseParser
from application.parser.file.base_parser import BaseParser
class CSVParser(BaseParser):

View File

@@ -0,0 +1 @@

View File

@@ -2,7 +2,7 @@
from dataclasses import dataclass
from langchain.docstore.document import Document as LCDocument
from parser.schema.schema import BaseDocument
from application.parser.schema.schema import BaseDocument
@dataclass

View File

@@ -3,7 +3,7 @@ from math import ceil
from typing import List
import tiktoken
from parser.schema.base import Document
from application.parser.schema.base import Document
def separate_header_and_body(text):