Mirror of https://github.com/arc53/DocsGPT.git
Merge branch 'main' into feat/agent-menu
@@ -48,10 +48,13 @@ class ClassicAgent(BaseAgent):
        ):
            yield {"answer": resp.message.content}
        else:
            completion = self.llm.gen_stream(
                model=self.gpt_model, messages=messages, tools=self.tools
            )
            for line in completion:
            # completion = self.llm.gen_stream(
            #     model=self.gpt_model, messages=messages, tools=self.tools
            # )
            # log type of resp
            logger.info(f"Response type: {type(resp)}")
            logger.info(f"Response: {resp}")
            for line in resp:
                if isinstance(line, str):
                    yield {"answer": line}
@@ -15,7 +15,7 @@ class LLMHandler(ABC):
    @abstractmethod
    def handle_response(self, agent, resp, tools_dict, messages, attachments=None, **kwargs):
        pass

    def prepare_messages_with_attachments(self, agent, messages, attachments=None):
        """
        Prepare messages with attachment content if available.
@@ -33,15 +33,53 @@ class LLMHandler(ABC):

        logger.info(f"Preparing messages with {len(attachments)} attachments")

        # Check if the LLM has its own custom attachment handling implementation
        if hasattr(agent.llm, "prepare_messages_with_attachments") and agent.llm.__class__.__name__ != "BaseLLM":
            logger.info(f"Using {agent.llm.__class__.__name__}'s own prepare_messages_with_attachments method")
            return agent.llm.prepare_messages_with_attachments(messages, attachments)
        supported_types = agent.llm.get_supported_attachment_types()

        # Otherwise, append attachment content to the system prompt
        supported_attachments = []
        unsupported_attachments = []

        for attachment in attachments:
            mime_type = attachment.get('mime_type')
            if not mime_type:
                import mimetypes
                file_path = attachment.get('path')
                if file_path:
                    mime_type = mimetypes.guess_type(file_path)[0] or 'application/octet-stream'
                else:
                    unsupported_attachments.append(attachment)
                    continue

            if mime_type in supported_types:
                supported_attachments.append(attachment)
            else:
                unsupported_attachments.append(attachment)

        # Process supported attachments with the LLM's custom method
        prepared_messages = messages
        if supported_attachments:
            logger.info(f"Processing {len(supported_attachments)} supported attachments with {agent.llm.__class__.__name__}'s method")
            prepared_messages = agent.llm.prepare_messages_with_attachments(messages, supported_attachments)

        # Process unsupported attachments with the default method
        if unsupported_attachments:
            logger.info(f"Processing {len(unsupported_attachments)} unsupported attachments with default method")
            prepared_messages = self._append_attachment_content_to_system(prepared_messages, unsupported_attachments)

        return prepared_messages

    def _append_attachment_content_to_system(self, messages, attachments):
        """
        Default method to append attachment content to the system prompt.

        Args:
            messages (list): List of message dictionaries.
            attachments (list): List of attachment dictionaries with content.

        Returns:
            list: Messages with attachment context added to the system prompt.
        """
        prepared_messages = messages.copy()

        # Build attachment content string
        attachment_texts = []
        for attachment in attachments:
            logger.info(f"Adding attachment {attachment.get('id')} to context")
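A note on the dispatch above: attachments whose MIME type the provider accepts are routed to the provider's own prepare_messages_with_attachments, and everything else falls back to the system-prompt append. A minimal, self-contained sketch of the same partitioning logic (the sample attachments and type list are illustrative, not part of the commit):

    import mimetypes

    def split_attachments(attachments, supported_types):
        # Partition attachments by whether the target LLM accepts their MIME type.
        supported, unsupported = [], []
        for attachment in attachments:
            mime_type = attachment.get("mime_type")
            if not mime_type and attachment.get("path"):
                mime_type = mimetypes.guess_type(attachment["path"])[0] or "application/octet-stream"
            (supported if mime_type in supported_types else unsupported).append(attachment)
        return supported, unsupported

    supported, unsupported = split_attachments(
        [{"path": "report.pdf"}, {"path": "notes.txt"}],
        supported_types=["application/pdf", "image/png"],
    )
    print([a["path"] for a in supported], [a["path"] for a in unsupported])
    # ['report.pdf'] ['notes.txt']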
@@ -122,12 +160,13 @@ class OpenAILLMHandler(LLMHandler):
            return resp

        else:

            text_buffer = ""
            while True:
                tool_calls = {}
                for chunk in resp:
                    if isinstance(chunk, str) and len(chunk) > 0:
                        return
                        yield chunk
                        continue
                    elif hasattr(chunk, "delta"):
                        chunk_delta = chunk.delta
@@ -206,12 +245,17 @@ class OpenAILLMHandler(LLMHandler):
                            }
                        )
                        tool_calls = {}
                    if hasattr(chunk_delta, "content") and chunk_delta.content:
                        # Add to buffer or yield immediately based on your preference
                        text_buffer += chunk_delta.content
                        yield text_buffer
                        text_buffer = ""

                    if (
                        hasattr(chunk, "finish_reason")
                        and chunk.finish_reason == "stop"
                    ):
                        return
                        return resp
                    elif isinstance(chunk, str) and len(chunk) == 0:
                        continue
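The buffering pattern in this handler can be reproduced with plain objects. A minimal, self-contained sketch (the Delta and Chunk classes are stand-ins for the OpenAI SDK's streaming chunk objects, not the real types):

    from dataclasses import dataclass

    @dataclass
    class Delta:
        content: str

    @dataclass
    class Chunk:
        delta: Delta
        finish_reason: str = None

    def stream_text(chunks):
        # Accumulate delta content and yield it as it arrives; stop on finish_reason.
        text_buffer = ""
        for chunk in chunks:
            if hasattr(chunk, "delta") and chunk.delta.content:
                text_buffer += chunk.delta.content
                yield text_buffer
                text_buffer = ""
            if getattr(chunk, "finish_reason", None) == "stop":
                return

    chunks = [Chunk(Delta("Hel")), Chunk(Delta("lo")), Chunk(Delta(""), finish_reason="stop")]
    print("".join(stream_text(chunks)))  # Hello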
@@ -227,7 +271,7 @@ class GoogleLLMHandler(LLMHandler):
        from google.genai import types

        messages = self.prepare_messages_with_attachments(agent, messages, attachments)

        while True:
            if not stream:
                response = agent.llm.gen(
@@ -298,6 +342,9 @@ class GoogleLLMHandler(LLMHandler):
                            "content": [function_response_part.to_json_dict()],
                        }
                    )
                else:
                    tool_call_found = False
                    yield result

            if not tool_call_found:
                return response
@@ -657,6 +657,7 @@ class Answer(Resource):
        source_log_docs = []
        tool_calls = []
        stream_ended = False
        thought = ""

        for line in complete_stream(
            question=question,
@@ -679,6 +680,8 @@ class Answer(Resource):
                source_log_docs = event["source"]
            elif event["type"] == "tool_calls":
                tool_calls = event["tool_calls"]
            elif event["type"] == "thought":
                thought = event["thought"]
            elif event["type"] == "error":
                logger.error(f"Error from stream: {event['error']}")
                return bad_request(500, event["error"])
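The endpoint collects the streamed pieces by dispatching on the event's type field, including the new thought event. A toy consumer of the same event shape (the sample events are made up):

    def consume(events):
        # Collect the pieces of a streamed answer into one record.
        result = {"sources": [], "tool_calls": [], "thought": ""}
        for event in events:
            if event["type"] == "source":
                result["sources"] = event["source"]
            elif event["type"] == "tool_calls":
                result["tool_calls"] = event["tool_calls"]
            elif event["type"] == "thought":
                result["thought"] = event["thought"]
            elif event["type"] == "error":
                raise RuntimeError(event["error"])
        return result

    print(consume([{"type": "thought", "thought": "checking docs"},
                   {"type": "source", "source": [{"title": "README"}]}]))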
@@ -710,6 +713,7 @@ class Answer(Resource):
            conversation_id,
            question,
            response_full,
            thought,
            source_log_docs,
            tool_calls,
            llm,
@@ -876,14 +880,7 @@ def get_attachments_content(attachment_ids, user):
            )

            if attachment_doc:
                attachments.append(
                    {
                        "id": str(attachment_doc["_id"]),
                        "content": attachment_doc["content"],
                        "token_count": attachment_doc.get("token_count", 0),
                        "path": attachment_doc.get("path", ""),
                    }
                )
                attachments.append(attachment_doc)
        except Exception as e:
            logger.error(f"Error retrieving attachment {attachment_id}: {e}")
@@ -2792,25 +2792,25 @@ class StoreAttachment(Resource):
        user = secure_filename(decoded_token.get("sub"))

        try:
            attachment_id = ObjectId()
            original_filename = secure_filename(file.filename)
            folder_name = original_filename

            save_dir = os.path.join(
                current_dir, settings.UPLOAD_FOLDER, user, "attachments", folder_name
                current_dir,
                settings.UPLOAD_FOLDER,
                user,
                "attachments",
                str(attachment_id),
            )
            os.makedirs(save_dir, exist_ok=True)
            # Create directory structure: user/attachments/filename/

            file_path = os.path.join(save_dir, original_filename)

            # Handle filename conflicts
            if os.path.exists(file_path):
                name_parts = os.path.splitext(original_filename)
                timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
                new_filename = f"{name_parts[0]}_{timestamp}{name_parts[1]}"
                file_path = os.path.join(save_dir, new_filename)
                original_filename = new_filename

            file.save(file_path)
            file_info = {"folder": folder_name, "filename": original_filename}
            file_info = {
                "filename": original_filename,
                "attachment_id": str(attachment_id),
            }
            current_app.logger.info(f"Saved file: {file_path}")

            # Start async task to process single file
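Uploads are now keyed by the pre-allocated attachment id, and a name collision inside a directory gets a timestamp suffix. A self-contained sketch of just that renaming step (the uploads/demo directory is illustrative):

    import datetime
    import os

    def unique_path(save_dir, filename):
        # Append a timestamp suffix when the target filename already exists.
        path = os.path.join(save_dir, filename)
        if os.path.exists(path):
            stem, ext = os.path.splitext(filename)
            stamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
            path = os.path.join(save_dir, f"{stem}_{stamp}{ext}")
        return path

    os.makedirs("uploads/demo", exist_ok=True)
    print(unique_path("uploads/demo", "report.pdf"))  # uploads/demo/report.pdf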
@@ -55,3 +55,12 @@ class BaseLLM(ABC):

    def _supports_tools(self):
        raise NotImplementedError("Subclass must implement _supports_tools method")

    def get_supported_attachment_types(self):
        """
        Return a list of MIME types supported by this LLM for file uploads.

        Returns:
            list: List of supported MIME types
        """
        return []  # Default: no attachments supported
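Providers opt in to attachments by overriding this hook. A sketch of a hypothetical subclass (the class name and MIME list are illustrative, not part of this commit):

    from application.llm.base import BaseLLM

    class ImageOnlyLLM(BaseLLM):
        # Hypothetical provider that accepts only PNG and JPEG uploads.
        def get_supported_attachment_types(self):
            return ["image/png", "image/jpeg"]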
@@ -1,5 +1,9 @@
from google import genai
from google.genai import types
import os
import logging
import mimetypes
import json

from application.llm.base import BaseLLM
@@ -9,6 +13,138 @@ class GoogleLLM(BaseLLM):
        super().__init__(*args, **kwargs)
        self.api_key = api_key
        self.user_api_key = user_api_key
        self.client = genai.Client(api_key=self.api_key)

    def get_supported_attachment_types(self):
        """
        Return a list of MIME types supported by Google Gemini for file uploads.

        Returns:
            list: List of supported MIME types
        """
        return [
            'application/pdf',
            'image/png',
            'image/jpeg',
            'image/jpg',
            'image/webp',
            'image/gif'
        ]

    def prepare_messages_with_attachments(self, messages, attachments=None):
        """
        Process attachments using Google AI's file API for more efficient handling.

        Args:
            messages (list): List of message dictionaries.
            attachments (list): List of attachment dictionaries with content and metadata.

        Returns:
            list: Messages formatted with file references for Google AI API.
        """
        if not attachments:
            return messages

        prepared_messages = messages.copy()

        # Find the last user message, to attach the files to it
        user_message_index = None
        for i in range(len(prepared_messages) - 1, -1, -1):
            if prepared_messages[i].get("role") == "user":
                user_message_index = i
                break

        if user_message_index is None:
            user_message = {"role": "user", "content": []}
            prepared_messages.append(user_message)
            user_message_index = len(prepared_messages) - 1

        if isinstance(prepared_messages[user_message_index].get("content"), str):
            text_content = prepared_messages[user_message_index]["content"]
            prepared_messages[user_message_index]["content"] = [
                {"type": "text", "text": text_content}
            ]
        elif not isinstance(prepared_messages[user_message_index].get("content"), list):
            prepared_messages[user_message_index]["content"] = []

        files = []
        for attachment in attachments:
            mime_type = attachment.get('mime_type')
            if not mime_type:
                file_path = attachment.get('path')
                if file_path:
                    mime_type = mimetypes.guess_type(file_path)[0] or 'application/octet-stream'

            if mime_type in self.get_supported_attachment_types():
                try:
                    file_uri = self._upload_file_to_google(attachment)
                    logging.info(f"GoogleLLM: Successfully uploaded file, got URI: {file_uri}")
                    files.append({"file_uri": file_uri, "mime_type": mime_type})
                except Exception as e:
                    logging.error(f"GoogleLLM: Error uploading file: {e}")
                    if 'content' in attachment:
                        prepared_messages[user_message_index]["content"].append({
                            "type": "text",
                            "text": f"[File could not be processed: {attachment.get('path', 'unknown')}]"
                        })

        if files:
            logging.info(f"GoogleLLM: Adding {len(files)} files to message")
            prepared_messages[user_message_index]["content"].append({
                "files": files
            })

        return prepared_messages

    def _upload_file_to_google(self, attachment):
        """
        Upload a file to Google AI and return the file URI.

        Args:
            attachment (dict): Attachment dictionary with path and metadata.

        Returns:
            str: Google AI file URI for the uploaded file.
        """
        if 'google_file_uri' in attachment:
            return attachment['google_file_uri']

        file_path = attachment.get('path')
        if not file_path:
            raise ValueError("No file path provided in attachment")

        if not os.path.isabs(file_path):
            current_dir = os.path.dirname(
                os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
            )
            file_path = os.path.join(current_dir, "application", file_path)

        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        mime_type = attachment.get('mime_type')
        if not mime_type:
            mime_type = mimetypes.guess_type(file_path)[0] or 'application/octet-stream'

        try:
            response = self.client.files.upload(file=file_path)

            file_uri = response.uri

            from application.core.mongo_db import MongoDB
            mongo = MongoDB.get_client()
            db = mongo["docsgpt"]
            attachments_collection = db["attachments"]
            if '_id' in attachment:
                attachments_collection.update_one(
                    {"_id": attachment['_id']},
                    {"$set": {"google_file_uri": file_uri}}
                )

            return file_uri
        except Exception as e:
            logging.error(f"Error uploading file to Google AI: {e}")
            raise

    def _clean_messages_google(self, messages):
        cleaned_messages = []
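Before any upload happens, the method normalizes the last user message from a plain string into a list of typed parts and appends a files entry. A pure-Python illustration of that shape change (the URI and data are made up; no API call is made):

    before = [{"role": "user", "content": "Summarize the attached report."}]

    after = [{
        "role": "user",
        "content": [
            {"type": "text", "text": "Summarize the attached report."},
            {"files": [{"file_uri": "files/abc123", "mime_type": "application/pdf"}]},
        ],
    }]
    print(after[0]["content"][1]["files"][0]["mime_type"])  # application/pdf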
@@ -26,7 +162,7 @@ class GoogleLLM(BaseLLM):
            elif isinstance(content, list):
                for item in content:
                    if "text" in item:
                        parts.append(types.Part.from_text(item["text"]))
                        parts.append(types.Part.from_text(text=item["text"]))
                    elif "function_call" in item:
                        parts.append(
                            types.Part.from_function_call(
@@ -41,6 +177,14 @@ class GoogleLLM(BaseLLM):
                                response=item["function_response"]["response"],
                            )
                        )
                    elif "files" in item:
                        for file_data in item["files"]:
                            parts.append(
                                types.Part.from_uri(
                                    file_uri=file_data["file_uri"],
                                    mime_type=file_data["mime_type"]
                                )
                            )
                    else:
                        raise ValueError(
                            f"Unexpected content dictionary format: {item}"
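The files branch maps each uploaded file into a google-genai Part. A minimal helper doing the same conversion, assuming the google-genai SDK is installed and the URIs came from a prior upload:

    from google.genai import types

    def parts_from_files(file_items):
        # Convert {"file_uri", "mime_type"} dicts into google-genai Part objects.
        return [
            types.Part.from_uri(file_uri=f["file_uri"], mime_type=f["mime_type"])
            for f in file_items
        ]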
@@ -145,12 +289,26 @@ class GoogleLLM(BaseLLM):
        if tools:
            cleaned_tools = self._clean_tools_format(tools)
            config.tools = cleaned_tools

        # Check if we have both tools and file attachments
        has_attachments = False
        for message in messages:
            for part in message.parts:
                if hasattr(part, 'file_data') and part.file_data is not None:
                    has_attachments = True
                    break
            if has_attachments:
                break

        logging.info(f"GoogleLLM: Starting stream generation. Model: {model}, Messages: {json.dumps(messages, default=str)}, Has attachments: {has_attachments}")

        response = client.models.generate_content_stream(
            model=model,
            contents=messages,
            config=config,
        )

        for chunk in response:
            if hasattr(chunk, "candidates") and chunk.candidates:
                for candidate in chunk.candidates:
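For reference, a standalone use of the streaming call this method wraps. It assumes the google-genai SDK and a GOOGLE_API_KEY environment variable, and the model name is illustrative:

    import os
    from google import genai

    client = genai.Client(api_key=os.environ["GOOGLE_API_KEY"])
    # Stream a completion and print text as chunks arrive.
    for chunk in client.models.generate_content_stream(
        model="gemini-2.0-flash", contents="Say hello in one sentence."
    ):
        if chunk.text:
            print(chunk.text, end="")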
@@ -1,4 +1,8 @@
import json
import base64
import os
import mimetypes
import logging

from application.core.settings import settings
from application.llm.base import BaseLLM
@@ -65,6 +69,15 @@ class OpenAILLM(BaseLLM):
                        ),
                    }
                )
            elif isinstance(item, dict):
                content_parts = []
                if "text" in item:
                    content_parts.append({"type": "text", "text": item["text"]})
                elif "type" in item and item["type"] == "text" and "text" in item:
                    content_parts.append(item)
                elif "type" in item and item["type"] == "file" and "file" in item:
                    content_parts.append(item)
                cleaned_messages.append({"role": role, "content": content_parts})
            else:
                raise ValueError(
                    f"Unexpected content dictionary format: {item}"
@@ -133,6 +146,183 @@ class OpenAILLM(BaseLLM):
    def _supports_tools(self):
        return True

    def get_supported_attachment_types(self):
        """
        Return a list of MIME types supported by OpenAI for file uploads.

        Returns:
            list: List of supported MIME types
        """
        return [
            'application/pdf',
            'image/png',
            'image/jpeg',
            'image/jpg',
            'image/webp',
            'image/gif'
        ]

    def prepare_messages_with_attachments(self, messages, attachments=None):
        """
        Process attachments using OpenAI's file API for more efficient handling.

        Args:
            messages (list): List of message dictionaries.
            attachments (list): List of attachment dictionaries with content and metadata.

        Returns:
            list: Messages formatted with file references for OpenAI API.
        """
        if not attachments:
            return messages

        prepared_messages = messages.copy()

        # Find the last user message, to attach the file_id to it
        user_message_index = None
        for i in range(len(prepared_messages) - 1, -1, -1):
            if prepared_messages[i].get("role") == "user":
                user_message_index = i
                break

        if user_message_index is None:
            user_message = {"role": "user", "content": []}
            prepared_messages.append(user_message)
            user_message_index = len(prepared_messages) - 1

        if isinstance(prepared_messages[user_message_index].get("content"), str):
            text_content = prepared_messages[user_message_index]["content"]
            prepared_messages[user_message_index]["content"] = [
                {"type": "text", "text": text_content}
            ]
        elif not isinstance(prepared_messages[user_message_index].get("content"), list):
            prepared_messages[user_message_index]["content"] = []

        for attachment in attachments:
            mime_type = attachment.get('mime_type')
            if not mime_type:
                file_path = attachment.get('path')
                if file_path:
                    mime_type = mimetypes.guess_type(file_path)[0] or 'application/octet-stream'

            if mime_type and mime_type.startswith('image/'):
                try:
                    base64_image = self._get_base64_image(attachment)
                    prepared_messages[user_message_index]["content"].append({
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{mime_type};base64,{base64_image}"
                        }
                    })
                except Exception as e:
                    logging.error(f"Error processing image attachment: {e}")
                    if 'content' in attachment:
                        prepared_messages[user_message_index]["content"].append({
                            "type": "text",
                            "text": f"[Image could not be processed: {attachment.get('path', 'unknown')}]"
                        })
            # Handle PDFs using the file API
            elif mime_type == 'application/pdf':
                try:
                    file_id = self._upload_file_to_openai(attachment)

                    prepared_messages[user_message_index]["content"].append({
                        "type": "file",
                        "file": {"file_id": file_id}
                    })
                except Exception as e:
                    logging.error(f"Error uploading PDF to OpenAI: {e}")
                    if 'content' in attachment:
                        prepared_messages[user_message_index]["content"].append({
                            "type": "text",
                            "text": f"File content:\n\n{attachment['content']}"
                        })

        return prepared_messages

    def _get_base64_image(self, attachment):
        """
        Convert an image file to base64 encoding.

        Args:
            attachment (dict): Attachment dictionary with path and metadata.

        Returns:
            str: Base64-encoded image data.
        """
        file_path = attachment.get('path')
        if not file_path:
            raise ValueError("No file path provided in attachment")

        if not os.path.isabs(file_path):
            current_dir = os.path.dirname(
                os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
            )
            file_path = os.path.join(current_dir, "application", file_path)

        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        with open(file_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode('utf-8')

    def _upload_file_to_openai(self, attachment):  # PDFs
        """
        Upload a file to OpenAI and return the file_id.

        Args:
            attachment (dict): Attachment dictionary with path and metadata.
                Expected keys:
                - path: Path to the file
                - id: Optional MongoDB ID for caching

        Returns:
            str: OpenAI file_id for the uploaded file.
        """
        import os
        import logging

        if 'openai_file_id' in attachment:
            return attachment['openai_file_id']

        file_path = attachment.get('path')
        if not file_path:
            raise ValueError("No file path provided in attachment")

        if not os.path.isabs(file_path):
            current_dir = os.path.dirname(
                os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
            )
            file_path = os.path.join(current_dir, "application", file_path)

        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        try:
            with open(file_path, 'rb') as file:
                response = self.client.files.create(
                    file=file,
                    purpose="assistants"
                )

            file_id = response.id

            from application.core.mongo_db import MongoDB
            mongo = MongoDB.get_client()
            db = mongo["docsgpt"]
            attachments_collection = db["attachments"]
            if '_id' in attachment:
                attachments_collection.update_one(
                    {"_id": attachment['_id']},
                    {"$set": {"openai_file_id": file_id}}
                )

            return file_id
        except Exception as e:
            logging.error(f"Error uploading file to OpenAI: {e}")
            raise


class AzureOpenAILLM(OpenAILLM):
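Images are inlined as base64 data URLs rather than uploaded. A self-contained sketch of that encoding (the bytes are dummy content, not a real image):

    import base64

    def to_data_url(raw_bytes, mime_type):
        # Base64-encode bytes and wrap them in a data: URL for an image_url part.
        encoded = base64.b64encode(raw_bytes).decode("utf-8")
        return {"type": "image_url", "image_url": {"url": f"data:{mime_type};base64,{encoded}"}}

    print(to_data_url(b"\x89PNG fake bytes", "image/png"))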
@@ -73,7 +73,13 @@ class PandasCSVParser(BaseParser):
            for more information.
            Set to an empty dict by default, which means pandas will try to
            figure out the separators, table head, etc. on its own.

        header_period (int): Controls how headers are included in the output:
            - 0: Headers only at the beginning
            - 1: Headers in every row
            - N > 1: Headers every N rows

        header_prefix (str): Prefix for header rows. Default is "HEADERS: ".
    """

    def __init__(
@@ -83,6 +89,8 @@ class PandasCSVParser(BaseParser):
        col_joiner: str = ", ",
        row_joiner: str = "\n",
        pandas_config: dict = {},
        header_period: int = 20,
        header_prefix: str = "HEADERS: ",
        **kwargs: Any
    ) -> None:
        """Init params."""
@@ -91,6 +99,8 @@ class PandasCSVParser(BaseParser):
        self._col_joiner = col_joiner
        self._row_joiner = row_joiner
        self._pandas_config = pandas_config
        self._header_period = header_period
        self._header_prefix = header_prefix

    def _init_parser(self) -> Dict:
        """Init parser."""
@@ -104,15 +114,26 @@ class PandasCSVParser(BaseParser):
            raise ValueError("pandas module is required to read CSV files.")

        df = pd.read_csv(file, **self._pandas_config)
        headers = df.columns.tolist()
        header_row = f"{self._header_prefix}{self._col_joiner.join(headers)}"

        text_list = df.apply(
            lambda row: (self._col_joiner).join(row.astype(str).tolist()), axis=1
        ).tolist()
        if not self._concat_rows:
            return df.apply(
                lambda row: (self._col_joiner).join(row.astype(str).tolist()), axis=1
            ).tolist()

        text_list = []
        if self._header_period != 1:
            text_list.append(header_row)

        for i, row in df.iterrows():
            if (self._header_period > 1 and i > 0 and i % self._header_period == 0):
                text_list.append(header_row)
            text_list.append(self._col_joiner.join(row.astype(str).tolist()))
            if self._header_period == 1 and i < len(df) - 1:
                text_list.append(header_row)

        if self._concat_rows:
            return (self._row_joiner).join(text_list)
        else:
            return text_list
        return self._row_joiner.join(text_list)


class ExcelParser(BaseParser):
@@ -138,7 +159,13 @@ class ExcelParser(BaseParser):
            for more information.
            Set to an empty dict by default, which means pandas will try to
            figure out the table structure on its own.

        header_period (int): Controls how headers are included in the output:
            - 0: Headers only at the beginning
            - 1: Headers in every row
            - N > 1: Headers every N rows

        header_prefix (str): Prefix for header rows. Default is "HEADERS: ".
    """

    def __init__(
@@ -148,6 +175,8 @@ class ExcelParser(BaseParser):
        col_joiner: str = ", ",
        row_joiner: str = "\n",
        pandas_config: dict = {},
        header_period: int = 20,
        header_prefix: str = "HEADERS: ",
        **kwargs: Any
    ) -> None:
        """Init params."""
@@ -156,6 +185,8 @@ class ExcelParser(BaseParser):
        self._col_joiner = col_joiner
        self._row_joiner = row_joiner
        self._pandas_config = pandas_config
        self._header_period = header_period
        self._header_prefix = header_prefix

    def _init_parser(self) -> Dict:
        """Init parser."""
@@ -169,12 +200,22 @@ class ExcelParser(BaseParser):
            raise ValueError("pandas module is required to read Excel files.")

        df = pd.read_excel(file, **self._pandas_config)
        headers = df.columns.tolist()
        header_row = f"{self._header_prefix}{self._col_joiner.join(headers)}"

        if not self._concat_rows:
            return df.apply(
                lambda row: (self._col_joiner).join(row.astype(str).tolist()), axis=1
            ).tolist()

        text_list = []
        if self._header_period != 1:
            text_list.append(header_row)

        text_list = df.apply(
            lambda row: (self._col_joiner).join(row.astype(str).tolist()), axis=1
        ).tolist()

        if self._concat_rows:
            return (self._row_joiner).join(text_list)
        else:
            return text_list
        for i, row in df.iterrows():
            if (self._header_period > 1 and i > 0 and i % self._header_period == 0):
                text_list.append(header_row)
            text_list.append(self._col_joiner.join(row.astype(str).tolist()))
            if self._header_period == 1 and i < len(df) - 1:
                text_list.append(header_row)
        return self._row_joiner.join(text_list)
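Both parsers now interleave header rows the same way. A runnable illustration of the output layout for header_period=2, using made-up data:

    import pandas as pd

    df = pd.DataFrame({"name": ["a", "b", "c", "d"], "qty": [1, 2, 3, 4]})
    header_row = "HEADERS: " + ", ".join(df.columns)

    lines = [header_row]
    for i, row in df.iterrows():
        if i > 0 and i % 2 == 0:  # header_period = 2: repeat headers every 2 rows
            lines.append(header_row)
        lines.append(", ".join(row.astype(str)))
    print("\n".join(lines))
    # HEADERS: name, qty
    # a, 1
    # b, 2
    # HEADERS: name, qty
    # c, 3
    # d, 4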
@@ -41,7 +41,7 @@ lxml==5.3.1
markupsafe==3.0.2
marshmallow==3.26.1
mpmath==1.3.0
multidict==6.1.0
multidict==6.3.2
mypy-extensions==1.0.0
networkx==3.4.2
numpy==2.2.1
@@ -328,34 +328,30 @@ def attachment_worker(self, directory, file_info, user):
    """
    import datetime
    import os
    import mimetypes
    from application.utils import num_tokens_from_string

    mongo = MongoDB.get_client()
    db = mongo["docsgpt"]
    attachments_collection = db["attachments"]

    job_name = file_info["folder"]
    logging.info(f"Processing attachment: {job_name}", extra={"user": user, "job": job_name})
    filename = file_info["filename"]
    attachment_id = file_info["attachment_id"]

    logging.info(f"Processing attachment: {attachment_id}/{filename}", extra={"user": user})

    self.update_state(state="PROGRESS", meta={"current": 10})

    folder_name = file_info["folder"]
    filename = file_info["filename"]

    file_path = os.path.join(directory, filename)

    logging.info(f"Processing file: {file_path}", extra={"user": user, "job": job_name})

    if not os.path.exists(file_path):
        logging.warning(f"File not found: {file_path}", extra={"user": user, "job": job_name})
        return {"error": "File not found"}
        logging.warning(f"File not found: {file_path}", extra={"user": user})
        raise FileNotFoundError(f"File not found: {file_path}")

    try:
        reader = SimpleDirectoryReader(
            input_files=[file_path]
        )

        documents = reader.load_data()

        self.update_state(state="PROGRESS", meta={"current": 50})
@@ -364,33 +360,37 @@ def attachment_worker(self, directory, file_info, user):
            content = documents[0].text
            token_count = num_tokens_from_string(content)

            file_path_relative = f"{user}/attachments/{folder_name}/{filename}"
            file_path_relative = f"{settings.UPLOAD_FOLDER}/{user}/attachments/{attachment_id}/{filename}"

            attachment_id = attachments_collection.insert_one({
            mime_type = mimetypes.guess_type(file_path)[0] or 'application/octet-stream'

            doc_id = ObjectId(attachment_id)
            attachments_collection.insert_one({
                "_id": doc_id,
                "user": user,
                "path": file_path_relative,
                "content": content,
                "token_count": token_count,
                "mime_type": mime_type,
                "date": datetime.datetime.now(),
            }).inserted_id
            })

            logging.info(f"Stored attachment with ID: {attachment_id}",
                extra={"user": user, "job": job_name})
                extra={"user": user})

            self.update_state(state="PROGRESS", meta={"current": 100})

            return {
                "attachment_id": str(attachment_id),
                "filename": filename,
                "folder": folder_name,
                "path": file_path_relative,
                "token_count": token_count
                "token_count": token_count,
                "attachment_id": attachment_id,
                "mime_type": mime_type
            }
        else:
            logging.warning("No content was extracted from the file",
                extra={"user": user, "job": job_name})
            return {"error": "No content was extracted from the file"}
                extra={"user": user})
            raise ValueError("No content was extracted from the file")
    except Exception as e:
        logging.error(f"Error processing file {filename}: {e}",
            extra={"user": user, "job": job_name}, exc_info=True)
        return {"error": f"Error processing file: {str(e)}"}
        logging.error(f"Error processing file {filename}: {e}", extra={"user": user}, exc_info=True)
        raise
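One design note on this worker: the API route now allocates the ObjectId before dispatching the Celery task, so the on-disk path and the Mongo _id agree. A condensed sketch of that handshake (collection and task wiring omitted; bson ships with pymongo; the user and filename are illustrative):

    from bson.objectid import ObjectId

    # API side: allocate the id first and build the save path from it.
    attachment_id = ObjectId()
    save_dir = f"uploads/alice/attachments/{attachment_id}"
    file_info = {"filename": "report.pdf", "attachment_id": str(attachment_id)}

    # Worker side: recreate the same id so the stored document matches the path.
    doc = {"_id": ObjectId(file_info["attachment_id"]),
           "path": f"{save_dir}/{file_info['filename']}"}
    print(doc["_id"] == attachment_id)  # True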