parser functions change

token_func proposed change to chunking. open_ai_func proposed change to embedding_pipeline. Late chunking first implementation requires further testing.
2025-11-29 16:43:16 +00:00 · 2024-11-20 21:40:57 +04:00
3 changed files with 298 additions and 0 deletions
--- a/application/parser/chunking.py
+++ b/application/parser/chunking.py
@@ -0,0 +1,118 @@
+import re
+from typing import List, Tuple, Union
+import logging
+from application.parser.schema.base import Document
+from application.utils import get_encoding
+
+logger = logging.getLogger(__name__)
+
+class Chunker:
+    def __init__(
+        self,
+        chunking_strategy: str = "classic_chunk",
+        max_tokens: int = 2000,
+        min_tokens: int = 150,
+        duplicate_headers: bool = False,
+    ):
+        if chunking_strategy not in ["classic_chunk"]:
+            raise ValueError(f"Unsupported chunking strategy: {chunking_strategy}")
+        self.chunking_strategy = chunking_strategy
+        self.max_tokens = max_tokens
+        self.min_tokens = min_tokens
+        self.duplicate_headers = duplicate_headers
+        self.encoding = get_encoding()
+
+    def separate_header_and_body(self, text: str) -> Tuple[str, str]:
+        header_pattern = r"^(.*?\n){3}"
+        match = re.match(header_pattern, text)
+        if match:
+            header = match.group(0)
+            body = text[len(header):]
+        else:
+            header, body = "", text  # No header, treat entire text as body
+        return header, body
+
+    def combine_documents(self, doc: Document, next_doc: Document) -> Document:
+        combined_text = doc.text + " " + next_doc.text
+        combined_token_count = len(self.encoding.encode(combined_text))
+        new_doc = Document(
+            text=combined_text,
+            doc_id=doc.doc_id,
+            embedding=doc.embedding,
+            extra_info={**(doc.extra_info or {}), "token_count": combined_token_count}
+        )
+        return new_doc
+    
+    def split_document(self, doc: Document) -> List[Document]:
+        split_docs = []
+        header, body = self.separate_header_and_body(doc.text)
+        header_tokens = self.encoding.encode(header) if header else []
+        body_tokens = self.encoding.encode(body)
+
+        current_position = 0
+        part_index = 0
+        while current_position < len(body_tokens):
+            end_position = current_position + self.max_tokens - len(header_tokens)
+            chunk_tokens = (header_tokens + body_tokens[current_position:end_position]
+                            if self.duplicate_headers or part_index == 0 else body_tokens[current_position:end_position])
+            chunk_text = self.encoding.decode(chunk_tokens)
+            new_doc = Document(
+                text=chunk_text,
+                doc_id=f"{doc.doc_id}-{part_index}",
+                embedding=doc.embedding,
+                extra_info={**(doc.extra_info or {}), "token_count": len(chunk_tokens)}
+            )
+            split_docs.append(new_doc)
+            current_position = end_position
+            part_index += 1
+            header_tokens = []
+        return split_docs
+
+    def classic_chunk(self, documents: List[Document]) -> List[Document]:
+        processed_docs = []
+        i = 0
+        while i < len(documents):
+            doc = documents[i]
+            tokens = self.encoding.encode(doc.text)
+            token_count = len(tokens)
+
+            if self.min_tokens <= token_count <= self.max_tokens:
+                doc.extra_info = doc.extra_info or {}
+                doc.extra_info["token_count"] = token_count
+                processed_docs.append(doc)
+                i += 1
+            elif token_count < self.min_tokens:
+                if i + 1 < len(documents):
+                    next_doc = documents[i + 1]
+                    next_tokens = self.encoding.encode(next_doc.text)
+                    if token_count + len(next_tokens) <= self.max_tokens:
+                        # Combine small documents
+                        combined_doc = self.combine_documents(doc, next_doc)
+                        processed_docs.append(combined_doc)
+                        i += 2
+                    else:
+                        # Keep the small document as is if adding next_doc would exceed max_tokens
+                        doc.extra_info = doc.extra_info or {}
+                        doc.extra_info["token_count"] = token_count
+                        processed_docs.append(doc)
+                        i += 1
+                else:
+                    # No next document to combine with; add the small document as is
+                    doc.extra_info = doc.extra_info or {}
+                    doc.extra_info["token_count"] = token_count
+                    processed_docs.append(doc)
+                    i += 1
+            else:
+                # Split large documents
+                processed_docs.extend(self.split_document(doc))
+                i += 1
+        return processed_docs
+
+    def chunk(
+        self,
+        documents: List[Document]
+    ) -> List[Document]:
+        if self.chunking_strategy == "classic_chunk":
+            return self.classic_chunk(documents)
+        else:
+            raise ValueError("Unsupported chunking strategy")
--- a/application/parser/embedding_pipeline.py
+++ b/application/parser/embedding_pipeline.py
@@ -0,0 +1,86 @@
+import os
+import logging
+from retry import retry
+from tqdm import tqdm
+from application.core.settings import settings
+from application.vectorstore.vector_creator import VectorCreator
+
+
+@retry(tries=10, delay=60)
+def add_text_to_store_with_retry(store, doc, source_id):
+    """
+    Add a document's text and metadata to the vector store with retry logic.
+    Args:
+        store: The vector store object.
+        doc: The document to be added.
+        source_id: Unique identifier for the source.
+    """
+    try:
+        doc.metadata["source_id"] = str(source_id)
+        store.add_texts([doc.page_content], metadatas=[doc.metadata])
+    except Exception as e:
+        logging.error(f"Failed to add document with retry: {e}")
+        raise
+
+
+def embed_and_store_documents(docs, folder_name, source_id, task_status):
+    """
+    Embeds documents and stores them in a vector store.
+
+    Args:
+        docs (list): List of documents to be embedded and stored.
+        folder_name (str): Directory to save the vector store.
+        source_id (str): Unique identifier for the source.
+        task_status: Task state manager for progress updates.
+
+    Returns:
+        None
+    """
+    # Ensure the folder exists
+    if not os.path.exists(folder_name):
+        os.makedirs(folder_name)
+
+    # Initialize vector store
+    if settings.VECTOR_STORE == "faiss":
+        docs_init = [docs.pop(0)]
+        store = VectorCreator.create_vectorstore(
+            settings.VECTOR_STORE,
+            docs_init=docs_init,
+            source_id=folder_name,
+            embeddings_key=os.getenv("EMBEDDINGS_KEY"),
+        )
+    else:
+        store = VectorCreator.create_vectorstore(
+            settings.VECTOR_STORE,
+            source_id=source_id,
+            embeddings_key=os.getenv("EMBEDDINGS_KEY"),
+        )
+        store.delete_index()
+
+    total_docs = len(docs)
+
+    # Process and embed documents
+    for idx, doc in tqdm(
+        docs,
+        desc="Embedding 🦖",
+        unit="docs",
+        total=total_docs,
+        bar_format="{l_bar}{bar}| Time Left: {remaining}",
+    ):
+        try:
+            # Update task status for progress tracking
+            progress = int((idx / total_docs) * 100)
+            task_status.update_state(state="PROGRESS", meta={"current": progress})
+
+            # Add document to vector store
+            add_text_to_store_with_retry(store, doc, source_id)
+        except Exception as e:
+            logging.error(f"Error embedding document {idx}: {e}")
+            logging.info(f"Saving progress at document {idx} out of {total_docs}")
+            store.save_local(folder_name)
+            break
+
+    # Save the vector store
+    if settings.VECTOR_STORE == "faiss":
+        store.save_local(folder_name)
+    logging.info("Vector store saved successfully.")
--- a/application/parser/late_chunking.py
+++ b/application/parser/late_chunking.py
@@ -0,0 +1,94 @@
+from typing import List, Tuple, Union, Optional
+from transformers import AutoTokenizer, AutoModel
+from sentence_transformers import SentenceTransformer
+import torch
+import torch.nn as nn
+from application.parser.schema.base import Document
+
+
+class LateChunker:
+    def __init__(self, model_name: str, late_tokens: int = 1000, **model_kwargs):
+        """
+        Initialize the LateChunker with a model, tokenizer, and late_tokens limit.
+        Supports both transformers and sentence-transformers models.
+        """
+        self.late_tokens = late_tokens
+        self.model_name = model_name
+
+        # Load model based on type
+        if "sentence-transformers" in model_name:
+            self.model = SentenceTransformer(model_name, **model_kwargs)
+            self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+            self.wrapper_type = "sentence_transformers"
+        else:
+            self.model = AutoModel.from_pretrained(model_name, trust_remote_code=True, **model_kwargs)
+            self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+            self.wrapper_type = "transformers"
+
+    def tokenize_with_offsets(self, text: str):
+        """Tokenize text and return tokens with character offsets."""
+        tokens = self.tokenizer.encode_plus(
+            text, return_offsets_mapping=True, add_special_tokens=False
+        )
+        return tokens["input_ids"], tokens["offset_mapping"]
+
+    def late_chunk_with_embeddings(
+        self, documents: List[Document]
+    ) -> List[Tuple[str, List[Tuple[int, int]], List[float]]]:
+        """
+        Combines documents into 'super chunks' that fit within `late_tokens` limit.
+        Outputs each super chunk with span annotations and embeddings.
+        """
+        super_chunks = []
+        current_super_chunk_text = []
+        current_token_count = 0
+        span_annotations = []
+
+        for doc in documents:
+            doc_text = doc.text
+            input_ids, offsets = self.tokenize_with_offsets(doc_text)
+            doc_token_count = len(input_ids)
+
+            # Check if adding this document exceeds the late_tokens limit
+            if current_token_count + doc_token_count > self.late_tokens:
+                # Finalize the current super chunk
+                combined_text = " ".join(current_super_chunk_text)
+                embeddings = self.generate_embeddings(combined_text)
+
+                super_chunks.append((combined_text, span_annotations, embeddings))
+
+                # Reset for a new super chunk
+                current_super_chunk_text = []
+                span_annotations = []
+                current_token_count = 0
+
+            # Add document to the current super chunk
+            start_token = current_token_count
+            end_token = current_token_count + doc_token_count
+            span_annotations.append((start_token, end_token))
+            current_super_chunk_text.append(doc_text)
+            current_token_count = end_token
+
+        # Add the final super chunk if there are remaining documents
+        if current_super_chunk_text:
+            combined_text = " ".join(current_super_chunk_text)
+            embeddings = self.generate_embeddings(combined_text)
+            super_chunks.append((combined_text, span_annotations, embeddings))
+
+        return super_chunks
+
+    def generate_embeddings(self, text: str) -> List[float]:
+        """Generate embeddings for a given text using the loaded model."""
+        if self.wrapper_type == "sentence_transformers":
+            # Sentence-Transformers
+            embeddings = self.model.encode([text])
+            return embeddings[0].tolist()
+
+        elif self.wrapper_type == "transformers":
+            # Transformers models
+            inputs = self.tokenizer(text, return_tensors="pt")
+            model_output = self.model(**inputs)
+            return model_output.last_hidden_state.mean(dim=1).squeeze().tolist()
+
+        else:
+            raise ValueError("Unsupported model type for embedding generation.")