def sanitize_content(content: str) -> str:
    """
    Strip NUL characters out of text before it is handed to the vector store.

    NUL characters can cause vector store ingestion to fail, so every
    occurrence is dropped and the rest of the text is left untouched.

    Args:
        content (str): Raw content that may contain NUL characters. Falsy
            values (such as an empty string or None) are passed through
            unchanged.

    Returns:
        str: The content with all NUL characters removed, or the original
            value when it is falsy.
    """
    if content:
        # Only non-empty content needs scanning; replace() returns a new string.
        return content.replace('\x00', '')
    # Nothing to clean: hand back the falsy value exactly as received.
    return content