From 545caacfa34e519f3fcda0f435d03b8b34a1d831 Mon Sep 17 00:00:00 2001
From: Alex
Date: Tue, 26 Aug 2025 23:30:57 +0100
Subject: [PATCH] feat: prevent NUL character ingestion failures

---
 application/parser/embedding_pipeline.py | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/application/parser/embedding_pipeline.py b/application/parser/embedding_pipeline.py
index 38492c7c..7511f3df 100755
--- a/application/parser/embedding_pipeline.py
+++ b/application/parser/embedding_pipeline.py
@@ -6,6 +6,21 @@ from application.core.settings import settings
 from application.vectorstore.vector_creator import VectorCreator
 
 
+def sanitize_content(content: str) -> str:
+    """
+    Remove NUL characters that can cause vector store ingestion to fail.
+
+    Args:
+        content (str): Raw content that may contain NUL characters
+
+    Returns:
+        str: Sanitized content with NUL characters removed
+    """
+    if not content:
+        return content
+    return content.replace('\x00', '')
+
+
 @retry(tries=10, delay=60)
 def add_text_to_store_with_retry(store, doc, source_id):
     """
@@ -16,6 +31,9 @@ def add_text_to_store_with_retry(store, doc, source_id):
         source_id: Unique identifier for the source.
     """
     try:
+        # Sanitize content to remove NUL characters that cause ingestion failures
+        doc.page_content = sanitize_content(doc.page_content)
+
         doc.metadata["source_id"] = str(source_id)
         store.add_texts([doc.page_content], metadatas=[doc.metadata])
     except Exception as e: