Mirror of https://github.com/arc53/DocsGPT.git
Merge branch 'main' into feat/agent-menu
@@ -48,10 +48,13 @@ class ClassicAgent(BaseAgent):
        ):
            yield {"answer": resp.message.content}
        else:
            completion = self.llm.gen_stream(
                model=self.gpt_model, messages=messages, tools=self.tools
            )
            for line in completion:
            # completion = self.llm.gen_stream(
            #     model=self.gpt_model, messages=messages, tools=self.tools
            # )
            # log type of resp
            logger.info(f"Response type: {type(resp)}")
            logger.info(f"Response: {resp}")
            for line in resp:
                if isinstance(line, str):
                    yield {"answer": line}
@@ -15,7 +15,7 @@ class LLMHandler(ABC):
    @abstractmethod
    def handle_response(self, agent, resp, tools_dict, messages, attachments=None, **kwargs):
        pass

    def prepare_messages_with_attachments(self, agent, messages, attachments=None):
        """
        Prepare messages with attachment content if available.
@@ -33,15 +33,53 @@ class LLMHandler(ABC):

        logger.info(f"Preparing messages with {len(attachments)} attachments")

        # Check if the LLM has its own custom attachment handling implementation
        if hasattr(agent.llm, "prepare_messages_with_attachments") and agent.llm.__class__.__name__ != "BaseLLM":
            logger.info(f"Using {agent.llm.__class__.__name__}'s own prepare_messages_with_attachments method")
            return agent.llm.prepare_messages_with_attachments(messages, attachments)
        supported_types = agent.llm.get_supported_attachment_types()

        # Otherwise, append attachment content to the system prompt
        supported_attachments = []
        unsupported_attachments = []

        for attachment in attachments:
            mime_type = attachment.get('mime_type')
            if not mime_type:
                import mimetypes
                file_path = attachment.get('path')
                if file_path:
                    mime_type = mimetypes.guess_type(file_path)[0] or 'application/octet-stream'
                else:
                    unsupported_attachments.append(attachment)
                    continue

            if mime_type in supported_types:
                supported_attachments.append(attachment)
            else:
                unsupported_attachments.append(attachment)

        # Process supported attachments with the LLM's custom method
        prepared_messages = messages
        if supported_attachments:
            logger.info(f"Processing {len(supported_attachments)} supported attachments with {agent.llm.__class__.__name__}'s method")
            prepared_messages = agent.llm.prepare_messages_with_attachments(messages, supported_attachments)

        # Process unsupported attachments with the default method
        if unsupported_attachments:
            logger.info(f"Processing {len(unsupported_attachments)} unsupported attachments with default method")
            prepared_messages = self._append_attachment_content_to_system(prepared_messages, unsupported_attachments)

        return prepared_messages

    def _append_attachment_content_to_system(self, messages, attachments):
        """
        Default method to append attachment content to the system prompt.

        Args:
            messages (list): List of message dictionaries.
            attachments (list): List of attachment dictionaries with content.

        Returns:
            list: Messages with attachment context added to the system prompt.
        """
        prepared_messages = messages.copy()

        # Build attachment content string
        attachment_texts = []
        for attachment in attachments:
            logger.info(f"Adding attachment {attachment.get('id')} to context")
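A note on the dispatch above: attachments whose MIME type the provider accepts are routed to the provider's own prepare_messages_with_attachments, and everything else falls back to the system-prompt append. A minimal, self-contained sketch of the same partitioning logic (the sample attachments and type list are illustrative, not part of the commit):

    import mimetypes

    def split_attachments(attachments, supported_types):
        # Partition attachments by whether the target LLM accepts their MIME type.
        supported, unsupported = [], []
        for attachment in attachments:
            mime_type = attachment.get("mime_type")
            if not mime_type and attachment.get("path"):
                mime_type = mimetypes.guess_type(attachment["path"])[0] or "application/octet-stream"
            (supported if mime_type in supported_types else unsupported).append(attachment)
        return supported, unsupported

    supported, unsupported = split_attachments(
        [{"path": "report.pdf"}, {"path": "notes.txt"}],
        supported_types=["application/pdf", "image/png"],
    )
    print([a["path"] for a in supported], [a["path"] for a in unsupported])
    # ['report.pdf'] ['notes.txt']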
@@ -122,12 +160,13 @@ class OpenAILLMHandler(LLMHandler):
            return resp

        else:

            text_buffer = ""
            while True:
                tool_calls = {}
                for chunk in resp:
                    if isinstance(chunk, str) and len(chunk) > 0:
                        return
                        yield chunk
                        continue
                    elif hasattr(chunk, "delta"):
                        chunk_delta = chunk.delta
@@ -206,12 +245,17 @@ class OpenAILLMHandler(LLMHandler):
                            }
                        )
                        tool_calls = {}
                    if hasattr(chunk_delta, "content") and chunk_delta.content:
                        # Add to buffer or yield immediately based on your preference
                        text_buffer += chunk_delta.content
                        yield text_buffer
                        text_buffer = ""

                    if (
                        hasattr(chunk, "finish_reason")
                        and chunk.finish_reason == "stop"
                    ):
                        return
                        return resp
                    elif isinstance(chunk, str) and len(chunk) == 0:
                        continue
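The buffering pattern in this handler can be reproduced with plain objects. A minimal, self-contained sketch (the Delta and Chunk classes are stand-ins for the OpenAI SDK's streaming chunk objects, not the real types):

    from dataclasses import dataclass

    @dataclass
    class Delta:
        content: str

    @dataclass
    class Chunk:
        delta: Delta
        finish_reason: str = None

    def stream_text(chunks):
        # Accumulate delta content and yield it as it arrives; stop on finish_reason.
        text_buffer = ""
        for chunk in chunks:
            if hasattr(chunk, "delta") and chunk.delta.content:
                text_buffer += chunk.delta.content
                yield text_buffer
                text_buffer = ""
            if getattr(chunk, "finish_reason", None) == "stop":
                return

    chunks = [Chunk(Delta("Hel")), Chunk(Delta("lo")), Chunk(Delta(""), finish_reason="stop")]
    print("".join(stream_text(chunks)))  # Hello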
@@ -227,7 +271,7 @@ class GoogleLLMHandler(LLMHandler):
        from google.genai import types

        messages = self.prepare_messages_with_attachments(agent, messages, attachments)

        while True:
            if not stream:
                response = agent.llm.gen(
@@ -298,6 +342,9 @@ class GoogleLLMHandler(LLMHandler):
                            "content": [function_response_part.to_json_dict()],
                        }
                    )
                else:
                    tool_call_found = False
                    yield result

            if not tool_call_found:
                return response
@@ -657,6 +657,7 @@ class Answer(Resource):
        source_log_docs = []
        tool_calls = []
        stream_ended = False
        thought = ""

        for line in complete_stream(
            question=question,
@@ -679,6 +680,8 @@ class Answer(Resource):
                source_log_docs = event["source"]
            elif event["type"] == "tool_calls":
                tool_calls = event["tool_calls"]
            elif event["type"] == "thought":
                thought = event["thought"]
            elif event["type"] == "error":
                logger.error(f"Error from stream: {event['error']}")
                return bad_request(500, event["error"])
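The endpoint collects the streamed pieces by dispatching on the event's type field, including the new thought event. A toy consumer of the same event shape (the sample events are made up):

    def consume(events):
        # Collect the pieces of a streamed answer into one record.
        result = {"sources": [], "tool_calls": [], "thought": ""}
        for event in events:
            if event["type"] == "source":
                result["sources"] = event["source"]
            elif event["type"] == "tool_calls":
                result["tool_calls"] = event["tool_calls"]
            elif event["type"] == "thought":
                result["thought"] = event["thought"]
            elif event["type"] == "error":
                raise RuntimeError(event["error"])
        return result

    print(consume([{"type": "thought", "thought": "checking docs"},
                   {"type": "source", "source": [{"title": "README"}]}]))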
@@ -710,6 +713,7 @@ class Answer(Resource):
            conversation_id,
            question,
            response_full,
            thought,
            source_log_docs,
            tool_calls,
            llm,
@@ -876,14 +880,7 @@ def get_attachments_content(attachment_ids, user):
            )

            if attachment_doc:
                attachments.append(
                    {
                        "id": str(attachment_doc["_id"]),
                        "content": attachment_doc["content"],
                        "token_count": attachment_doc.get("token_count", 0),
                        "path": attachment_doc.get("path", ""),
                    }
                )
                attachments.append(attachment_doc)
        except Exception as e:
            logger.error(f"Error retrieving attachment {attachment_id}: {e}")
@@ -2792,25 +2792,25 @@ class StoreAttachment(Resource):
        user = secure_filename(decoded_token.get("sub"))

        try:
            attachment_id = ObjectId()
            original_filename = secure_filename(file.filename)
            folder_name = original_filename

            save_dir = os.path.join(
                current_dir, settings.UPLOAD_FOLDER, user, "attachments", folder_name
                current_dir,
                settings.UPLOAD_FOLDER,
                user,
                "attachments",
                str(attachment_id),
            )
            os.makedirs(save_dir, exist_ok=True)
            # Create directory structure: user/attachments/filename/

            file_path = os.path.join(save_dir, original_filename)

            # Handle filename conflicts
            if os.path.exists(file_path):
                name_parts = os.path.splitext(original_filename)
                timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
                new_filename = f"{name_parts[0]}_{timestamp}{name_parts[1]}"
                file_path = os.path.join(save_dir, new_filename)
                original_filename = new_filename

            file.save(file_path)
            file_info = {"folder": folder_name, "filename": original_filename}
            file_info = {
                "filename": original_filename,
                "attachment_id": str(attachment_id),
            }
            current_app.logger.info(f"Saved file: {file_path}")

            # Start async task to process single file
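Uploads are now keyed by the pre-allocated attachment id, and a name collision inside a directory gets a timestamp suffix. A self-contained sketch of just that renaming step (the uploads/demo directory is illustrative):

    import datetime
    import os

    def unique_path(save_dir, filename):
        # Append a timestamp suffix when the target filename already exists.
        path = os.path.join(save_dir, filename)
        if os.path.exists(path):
            stem, ext = os.path.splitext(filename)
            stamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
            path = os.path.join(save_dir, f"{stem}_{stamp}{ext}")
        return path

    os.makedirs("uploads/demo", exist_ok=True)
    print(unique_path("uploads/demo", "report.pdf"))  # uploads/demo/report.pdf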
@@ -55,3 +55,12 @@ class BaseLLM(ABC):

    def _supports_tools(self):
        raise NotImplementedError("Subclass must implement _supports_tools method")

    def get_supported_attachment_types(self):
        """
        Return a list of MIME types supported by this LLM for file uploads.

        Returns:
            list: List of supported MIME types
        """
        return []  # Default: no attachments supported
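Providers opt in to attachments by overriding this hook. A sketch of a hypothetical subclass (the class name and MIME list are illustrative, not part of this commit):

    from application.llm.base import BaseLLM

    class ImageOnlyLLM(BaseLLM):
        # Hypothetical provider that accepts only PNG and JPEG uploads.
        def get_supported_attachment_types(self):
            return ["image/png", "image/jpeg"]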
@@ -1,5 +1,9 @@
from google import genai
from google.genai import types
import os
import logging
import mimetypes
import json

from application.llm.base import BaseLLM
@@ -9,6 +13,138 @@ class GoogleLLM(BaseLLM):
        super().__init__(*args, **kwargs)
        self.api_key = api_key
        self.user_api_key = user_api_key
        self.client = genai.Client(api_key=self.api_key)

    def get_supported_attachment_types(self):
        """
        Return a list of MIME types supported by Google Gemini for file uploads.

        Returns:
            list: List of supported MIME types
        """
        return [
            'application/pdf',
            'image/png',
            'image/jpeg',
            'image/jpg',
            'image/webp',
            'image/gif'
        ]

    def prepare_messages_with_attachments(self, messages, attachments=None):
        """
        Process attachments using Google AI's file API for more efficient handling.

        Args:
            messages (list): List of message dictionaries.
            attachments (list): List of attachment dictionaries with content and metadata.

        Returns:
            list: Messages formatted with file references for Google AI API.
        """
        if not attachments:
            return messages

        prepared_messages = messages.copy()

        # Find the last user message, to attach the files to it
        user_message_index = None
        for i in range(len(prepared_messages) - 1, -1, -1):
            if prepared_messages[i].get("role") == "user":
                user_message_index = i
                break

        if user_message_index is None:
            user_message = {"role": "user", "content": []}
            prepared_messages.append(user_message)
            user_message_index = len(prepared_messages) - 1

        if isinstance(prepared_messages[user_message_index].get("content"), str):
            text_content = prepared_messages[user_message_index]["content"]
            prepared_messages[user_message_index]["content"] = [
                {"type": "text", "text": text_content}
            ]
        elif not isinstance(prepared_messages[user_message_index].get("content"), list):
            prepared_messages[user_message_index]["content"] = []

        files = []
        for attachment in attachments:
            mime_type = attachment.get('mime_type')
            if not mime_type:
                file_path = attachment.get('path')
                if file_path:
                    mime_type = mimetypes.guess_type(file_path)[0] or 'application/octet-stream'

            if mime_type in self.get_supported_attachment_types():
                try:
                    file_uri = self._upload_file_to_google(attachment)
                    logging.info(f"GoogleLLM: Successfully uploaded file, got URI: {file_uri}")
                    files.append({"file_uri": file_uri, "mime_type": mime_type})
                except Exception as e:
                    logging.error(f"GoogleLLM: Error uploading file: {e}")
                    if 'content' in attachment:
                        prepared_messages[user_message_index]["content"].append({
                            "type": "text",
                            "text": f"[File could not be processed: {attachment.get('path', 'unknown')}]"
                        })

        if files:
            logging.info(f"GoogleLLM: Adding {len(files)} files to message")
            prepared_messages[user_message_index]["content"].append({
                "files": files
            })

        return prepared_messages

    def _upload_file_to_google(self, attachment):
        """
        Upload a file to Google AI and return the file URI.

        Args:
            attachment (dict): Attachment dictionary with path and metadata.

        Returns:
            str: Google AI file URI for the uploaded file.
        """
        if 'google_file_uri' in attachment:
            return attachment['google_file_uri']

        file_path = attachment.get('path')
        if not file_path:
            raise ValueError("No file path provided in attachment")

        if not os.path.isabs(file_path):
            current_dir = os.path.dirname(
                os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
            )
            file_path = os.path.join(current_dir, "application", file_path)

        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        mime_type = attachment.get('mime_type')
        if not mime_type:
            mime_type = mimetypes.guess_type(file_path)[0] or 'application/octet-stream'

        try:
            response = self.client.files.upload(file=file_path)

            file_uri = response.uri

            from application.core.mongo_db import MongoDB
            mongo = MongoDB.get_client()
            db = mongo["docsgpt"]
            attachments_collection = db["attachments"]
            if '_id' in attachment:
                attachments_collection.update_one(
                    {"_id": attachment['_id']},
                    {"$set": {"google_file_uri": file_uri}}
                )

            return file_uri
        except Exception as e:
            logging.error(f"Error uploading file to Google AI: {e}")
            raise

    def _clean_messages_google(self, messages):
        cleaned_messages = []
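Before any upload happens, the method normalizes the last user message from a plain string into a list of typed parts and appends a files entry. A pure-Python illustration of that shape change (the URI and data are made up; no API call is made):

    before = [{"role": "user", "content": "Summarize the attached report."}]

    after = [{
        "role": "user",
        "content": [
            {"type": "text", "text": "Summarize the attached report."},
            {"files": [{"file_uri": "files/abc123", "mime_type": "application/pdf"}]},
        ],
    }]
    print(after[0]["content"][1]["files"][0]["mime_type"])  # application/pdf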
@@ -26,7 +162,7 @@ class GoogleLLM(BaseLLM):
            elif isinstance(content, list):
                for item in content:
                    if "text" in item:
                        parts.append(types.Part.from_text(item["text"]))
                        parts.append(types.Part.from_text(text=item["text"]))
                    elif "function_call" in item:
                        parts.append(
                            types.Part.from_function_call(
@@ -41,6 +177,14 @@ class GoogleLLM(BaseLLM):
                                response=item["function_response"]["response"],
                            )
                        )
                    elif "files" in item:
                        for file_data in item["files"]:
                            parts.append(
                                types.Part.from_uri(
                                    file_uri=file_data["file_uri"],
                                    mime_type=file_data["mime_type"]
                                )
                            )
                    else:
                        raise ValueError(
                            f"Unexpected content dictionary format: {item}"
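The files branch maps each uploaded file into a google-genai Part. A minimal helper doing the same conversion, assuming the google-genai SDK is installed and the URIs came from a prior upload:

    from google.genai import types

    def parts_from_files(file_items):
        # Convert {"file_uri", "mime_type"} dicts into google-genai Part objects.
        return [
            types.Part.from_uri(file_uri=f["file_uri"], mime_type=f["mime_type"])
            for f in file_items
        ]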
@@ -145,12 +289,26 @@ class GoogleLLM(BaseLLM):
        if tools:
            cleaned_tools = self._clean_tools_format(tools)
            config.tools = cleaned_tools

        # Check if we have both tools and file attachments
        has_attachments = False
        for message in messages:
            for part in message.parts:
                if hasattr(part, 'file_data') and part.file_data is not None:
                    has_attachments = True
                    break
            if has_attachments:
                break

        logging.info(f"GoogleLLM: Starting stream generation. Model: {model}, Messages: {json.dumps(messages, default=str)}, Has attachments: {has_attachments}")

        response = client.models.generate_content_stream(
            model=model,
            contents=messages,
            config=config,
        )

        for chunk in response:
            if hasattr(chunk, "candidates") and chunk.candidates:
                for candidate in chunk.candidates:
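For reference, a standalone use of the streaming call this method wraps. It assumes the google-genai SDK and a GOOGLE_API_KEY environment variable, and the model name is illustrative:

    import os
    from google import genai

    client = genai.Client(api_key=os.environ["GOOGLE_API_KEY"])
    # Stream a completion and print text as chunks arrive.
    for chunk in client.models.generate_content_stream(
        model="gemini-2.0-flash", contents="Say hello in one sentence."
    ):
        if chunk.text:
            print(chunk.text, end="")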
@@ -1,4 +1,8 @@
import json
import base64
import os
import mimetypes
import logging

from application.core.settings import settings
from application.llm.base import BaseLLM
@@ -65,6 +69,15 @@ class OpenAILLM(BaseLLM):
                        ),
                    }
                )
            elif isinstance(item, dict):
                content_parts = []
                if "text" in item:
                    content_parts.append({"type": "text", "text": item["text"]})
                elif "type" in item and item["type"] == "text" and "text" in item:
                    content_parts.append(item)
                elif "type" in item and item["type"] == "file" and "file" in item:
                    content_parts.append(item)
                cleaned_messages.append({"role": role, "content": content_parts})
            else:
                raise ValueError(
                    f"Unexpected content dictionary format: {item}"
@@ -133,6 +146,183 @@ class OpenAILLM(BaseLLM):
    def _supports_tools(self):
        return True

    def get_supported_attachment_types(self):
        """
        Return a list of MIME types supported by OpenAI for file uploads.

        Returns:
            list: List of supported MIME types
        """
        return [
            'application/pdf',
            'image/png',
            'image/jpeg',
            'image/jpg',
            'image/webp',
            'image/gif'
        ]

    def prepare_messages_with_attachments(self, messages, attachments=None):
        """
        Process attachments using OpenAI's file API for more efficient handling.

        Args:
            messages (list): List of message dictionaries.
            attachments (list): List of attachment dictionaries with content and metadata.

        Returns:
            list: Messages formatted with file references for OpenAI API.
        """
        if not attachments:
            return messages

        prepared_messages = messages.copy()

        # Find the last user message, to attach the file_id to it
        user_message_index = None
        for i in range(len(prepared_messages) - 1, -1, -1):
            if prepared_messages[i].get("role") == "user":
                user_message_index = i
                break

        if user_message_index is None:
            user_message = {"role": "user", "content": []}
            prepared_messages.append(user_message)
            user_message_index = len(prepared_messages) - 1

        if isinstance(prepared_messages[user_message_index].get("content"), str):
            text_content = prepared_messages[user_message_index]["content"]
            prepared_messages[user_message_index]["content"] = [
                {"type": "text", "text": text_content}
            ]
        elif not isinstance(prepared_messages[user_message_index].get("content"), list):
            prepared_messages[user_message_index]["content"] = []

        for attachment in attachments:
            mime_type = attachment.get('mime_type')
            if not mime_type:
                file_path = attachment.get('path')
                if file_path:
                    mime_type = mimetypes.guess_type(file_path)[0] or 'application/octet-stream'

            if mime_type and mime_type.startswith('image/'):
                try:
                    base64_image = self._get_base64_image(attachment)
                    prepared_messages[user_message_index]["content"].append({
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{mime_type};base64,{base64_image}"
                        }
                    })
                except Exception as e:
                    logging.error(f"Error processing image attachment: {e}")
                    if 'content' in attachment:
                        prepared_messages[user_message_index]["content"].append({
                            "type": "text",
                            "text": f"[Image could not be processed: {attachment.get('path', 'unknown')}]"
                        })
            # Handle PDFs using the file API
            elif mime_type == 'application/pdf':
                try:
                    file_id = self._upload_file_to_openai(attachment)

                    prepared_messages[user_message_index]["content"].append({
                        "type": "file",
                        "file": {"file_id": file_id}
                    })
                except Exception as e:
                    logging.error(f"Error uploading PDF to OpenAI: {e}")
                    if 'content' in attachment:
                        prepared_messages[user_message_index]["content"].append({
                            "type": "text",
                            "text": f"File content:\n\n{attachment['content']}"
                        })

        return prepared_messages

    def _get_base64_image(self, attachment):
        """
        Convert an image file to base64 encoding.

        Args:
            attachment (dict): Attachment dictionary with path and metadata.

        Returns:
            str: Base64-encoded image data.
        """
        file_path = attachment.get('path')
        if not file_path:
            raise ValueError("No file path provided in attachment")

        if not os.path.isabs(file_path):
            current_dir = os.path.dirname(
                os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
            )
            file_path = os.path.join(current_dir, "application", file_path)

        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        with open(file_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode('utf-8')

    def _upload_file_to_openai(self, attachment):  # PDFs
        """
        Upload a file to OpenAI and return the file_id.

        Args:
            attachment (dict): Attachment dictionary with path and metadata.
                Expected keys:
                - path: Path to the file
                - id: Optional MongoDB ID for caching

        Returns:
            str: OpenAI file_id for the uploaded file.
        """
        import os
        import logging

        if 'openai_file_id' in attachment:
            return attachment['openai_file_id']

        file_path = attachment.get('path')
        if not file_path:
            raise ValueError("No file path provided in attachment")

        if not os.path.isabs(file_path):
            current_dir = os.path.dirname(
                os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
            )
            file_path = os.path.join(current_dir, "application", file_path)

        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        try:
            with open(file_path, 'rb') as file:
                response = self.client.files.create(
                    file=file,
                    purpose="assistants"
                )

            file_id = response.id

            from application.core.mongo_db import MongoDB
            mongo = MongoDB.get_client()
            db = mongo["docsgpt"]
            attachments_collection = db["attachments"]
            if '_id' in attachment:
                attachments_collection.update_one(
                    {"_id": attachment['_id']},
                    {"$set": {"openai_file_id": file_id}}
                )

            return file_id
        except Exception as e:
            logging.error(f"Error uploading file to OpenAI: {e}")
            raise


class AzureOpenAILLM(OpenAILLM):
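Images are inlined as base64 data URLs rather than uploaded. A self-contained sketch of that encoding (the bytes are dummy content, not a real image):

    import base64

    def to_data_url(raw_bytes, mime_type):
        # Base64-encode bytes and wrap them in a data: URL for an image_url part.
        encoded = base64.b64encode(raw_bytes).decode("utf-8")
        return {"type": "image_url", "image_url": {"url": f"data:{mime_type};base64,{encoded}"}}

    print(to_data_url(b"\x89PNG fake bytes", "image/png"))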
@@ -73,7 +73,13 @@ class PandasCSVParser(BaseParser):
            for more information.
            Set to an empty dict by default, which means pandas will try to
            figure out the separators, table head, etc. on its own.

        header_period (int): Controls how headers are included in the output:
            - 0: Headers only at the beginning
            - 1: Headers in every row
            - N > 1: Headers every N rows

        header_prefix (str): Prefix for header rows. Default is "HEADERS: ".
    """

    def __init__(
@@ -83,6 +89,8 @@ class PandasCSVParser(BaseParser):
        col_joiner: str = ", ",
        row_joiner: str = "\n",
        pandas_config: dict = {},
        header_period: int = 20,
        header_prefix: str = "HEADERS: ",
        **kwargs: Any
    ) -> None:
        """Init params."""
@@ -91,6 +99,8 @@ class PandasCSVParser(BaseParser):
        self._col_joiner = col_joiner
        self._row_joiner = row_joiner
        self._pandas_config = pandas_config
        self._header_period = header_period
        self._header_prefix = header_prefix

    def _init_parser(self) -> Dict:
        """Init parser."""
@@ -104,15 +114,26 @@ class PandasCSVParser(BaseParser):
            raise ValueError("pandas module is required to read CSV files.")

        df = pd.read_csv(file, **self._pandas_config)
        headers = df.columns.tolist()
        header_row = f"{self._header_prefix}{self._col_joiner.join(headers)}"

        text_list = df.apply(
            lambda row: (self._col_joiner).join(row.astype(str).tolist()), axis=1
        ).tolist()
        if not self._concat_rows:
            return df.apply(
                lambda row: (self._col_joiner).join(row.astype(str).tolist()), axis=1
            ).tolist()

        text_list = []
        if self._header_period != 1:
            text_list.append(header_row)

        for i, row in df.iterrows():
            if (self._header_period > 1 and i > 0 and i % self._header_period == 0):
                text_list.append(header_row)
            text_list.append(self._col_joiner.join(row.astype(str).tolist()))
            if self._header_period == 1 and i < len(df) - 1:
                text_list.append(header_row)

        if self._concat_rows:
            return (self._row_joiner).join(text_list)
        else:
            return text_list
        return self._row_joiner.join(text_list)


class ExcelParser(BaseParser):
@@ -138,7 +159,13 @@ class ExcelParser(BaseParser):
            for more information.
            Set to an empty dict by default, which means pandas will try to
            figure out the table structure on its own.

        header_period (int): Controls how headers are included in the output:
            - 0: Headers only at the beginning
            - 1: Headers in every row
            - N > 1: Headers every N rows

        header_prefix (str): Prefix for header rows. Default is "HEADERS: ".
    """

    def __init__(
@@ -148,6 +175,8 @@ class ExcelParser(BaseParser):
        col_joiner: str = ", ",
        row_joiner: str = "\n",
        pandas_config: dict = {},
        header_period: int = 20,
        header_prefix: str = "HEADERS: ",
        **kwargs: Any
    ) -> None:
        """Init params."""
@@ -156,6 +185,8 @@ class ExcelParser(BaseParser):
        self._col_joiner = col_joiner
        self._row_joiner = row_joiner
        self._pandas_config = pandas_config
        self._header_period = header_period
        self._header_prefix = header_prefix

    def _init_parser(self) -> Dict:
        """Init parser."""
@@ -169,12 +200,22 @@ class ExcelParser(BaseParser):
            raise ValueError("pandas module is required to read Excel files.")

        df = pd.read_excel(file, **self._pandas_config)
        headers = df.columns.tolist()
        header_row = f"{self._header_prefix}{self._col_joiner.join(headers)}"

        if not self._concat_rows:
            return df.apply(
                lambda row: (self._col_joiner).join(row.astype(str).tolist()), axis=1
            ).tolist()

        text_list = []
        if self._header_period != 1:
            text_list.append(header_row)

        text_list = df.apply(
            lambda row: (self._col_joiner).join(row.astype(str).tolist()), axis=1
        ).tolist()

        if self._concat_rows:
            return (self._row_joiner).join(text_list)
        else:
            return text_list
        for i, row in df.iterrows():
            if (self._header_period > 1 and i > 0 and i % self._header_period == 0):
                text_list.append(header_row)
            text_list.append(self._col_joiner.join(row.astype(str).tolist()))
            if self._header_period == 1 and i < len(df) - 1:
                text_list.append(header_row)
        return self._row_joiner.join(text_list)
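Both parsers now interleave header rows the same way. A runnable illustration of the output layout for header_period=2, using made-up data:

    import pandas as pd

    df = pd.DataFrame({"name": ["a", "b", "c", "d"], "qty": [1, 2, 3, 4]})
    header_row = "HEADERS: " + ", ".join(df.columns)

    lines = [header_row]
    for i, row in df.iterrows():
        if i > 0 and i % 2 == 0:  # header_period = 2: repeat headers every 2 rows
            lines.append(header_row)
        lines.append(", ".join(row.astype(str)))
    print("\n".join(lines))
    # HEADERS: name, qty
    # a, 1
    # b, 2
    # HEADERS: name, qty
    # c, 3
    # d, 4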
@@ -41,7 +41,7 @@ lxml==5.3.1
markupsafe==3.0.2
marshmallow==3.26.1
mpmath==1.3.0
multidict==6.1.0
multidict==6.3.2
mypy-extensions==1.0.0
networkx==3.4.2
numpy==2.2.1
@@ -328,34 +328,30 @@ def attachment_worker(self, directory, file_info, user):
    """
    import datetime
    import os
    import mimetypes
    from application.utils import num_tokens_from_string

    mongo = MongoDB.get_client()
    db = mongo["docsgpt"]
    attachments_collection = db["attachments"]

    job_name = file_info["folder"]
    logging.info(f"Processing attachment: {job_name}", extra={"user": user, "job": job_name})
    filename = file_info["filename"]
    attachment_id = file_info["attachment_id"]

    logging.info(f"Processing attachment: {attachment_id}/{filename}", extra={"user": user})

    self.update_state(state="PROGRESS", meta={"current": 10})

    folder_name = file_info["folder"]
    filename = file_info["filename"]

    file_path = os.path.join(directory, filename)

    logging.info(f"Processing file: {file_path}", extra={"user": user, "job": job_name})

    if not os.path.exists(file_path):
        logging.warning(f"File not found: {file_path}", extra={"user": user, "job": job_name})
        return {"error": "File not found"}
        logging.warning(f"File not found: {file_path}", extra={"user": user})
        raise FileNotFoundError(f"File not found: {file_path}")

    try:
        reader = SimpleDirectoryReader(
            input_files=[file_path]
        )

        documents = reader.load_data()

        self.update_state(state="PROGRESS", meta={"current": 50})
@@ -364,33 +360,37 @@ def attachment_worker(self, directory, file_info, user):
            content = documents[0].text
            token_count = num_tokens_from_string(content)

            file_path_relative = f"{user}/attachments/{folder_name}/{filename}"
            file_path_relative = f"{settings.UPLOAD_FOLDER}/{user}/attachments/{attachment_id}/{filename}"

            attachment_id = attachments_collection.insert_one({
            mime_type = mimetypes.guess_type(file_path)[0] or 'application/octet-stream'

            doc_id = ObjectId(attachment_id)
            attachments_collection.insert_one({
                "_id": doc_id,
                "user": user,
                "path": file_path_relative,
                "content": content,
                "token_count": token_count,
                "mime_type": mime_type,
                "date": datetime.datetime.now(),
            }).inserted_id
            })

            logging.info(f"Stored attachment with ID: {attachment_id}",
                extra={"user": user, "job": job_name})
                extra={"user": user})

            self.update_state(state="PROGRESS", meta={"current": 100})

            return {
                "attachment_id": str(attachment_id),
                "filename": filename,
                "folder": folder_name,
                "path": file_path_relative,
                "token_count": token_count
                "token_count": token_count,
                "attachment_id": attachment_id,
                "mime_type": mime_type
            }
        else:
            logging.warning("No content was extracted from the file",
                extra={"user": user, "job": job_name})
            return {"error": "No content was extracted from the file"}
                extra={"user": user})
            raise ValueError("No content was extracted from the file")
    except Exception as e:
        logging.error(f"Error processing file {filename}: {e}",
            extra={"user": user, "job": job_name}, exc_info=True)
        return {"error": f"Error processing file: {str(e)}"}
        logging.error(f"Error processing file {filename}: {e}", extra={"user": user}, exc_info=True)
        raise
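One design note on this worker: the API route now allocates the ObjectId before dispatching the Celery task, so the on-disk path and the Mongo _id agree. A condensed sketch of that handshake (collection and task wiring omitted; bson ships with pymongo; the user and filename are illustrative):

    from bson.objectid import ObjectId

    # API side: allocate the id first and build the save path from it.
    attachment_id = ObjectId()
    save_dir = f"uploads/alice/attachments/{attachment_id}"
    file_info = {"filename": "report.pdf", "attachment_id": str(attachment_id)}

    # Worker side: recreate the same id so the stored document matches the path.
    doc = {"_id": ObjectId(file_info["attachment_id"]),
           "path": f"{save_dir}/{file_info['filename']}"}
    print(doc["_id"] == attachment_id)  # True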