diff --git a/application/core/settings.py b/application/core/settings.py index 1adf8351..91144ae9 100644 --- a/application/core/settings.py +++ b/application/core/settings.py @@ -118,6 +118,7 @@ class Settings(BaseSettings): # Encryption settings ENCRYPTION_SECRET_KEY: str = "default-docsgpt-encryption-key" + ELEVENLABS_API_KEY: Optional[str] = None path = Path(__file__).parent.parent.absolute() settings = Settings(_env_file=path.joinpath(".env"), _env_file_encoding="utf-8") diff --git a/application/tts/elevenlabs.py b/application/tts/elevenlabs.py index 2e8159b8..0d82021e 100644 --- a/application/tts/elevenlabs.py +++ b/application/tts/elevenlabs.py @@ -1,84 +1,30 @@ -import asyncio -import websockets -import json -import base64 from io import BytesIO +import base64 from application.tts.base import BaseTTS +from application.core.settings import settings class ElevenlabsTTS(BaseTTS): - def __init__(self): - self.api_key = 'ELEVENLABS_API_KEY'# here you should put your api key - self.model = "eleven_flash_v2_5" - self.voice = "VOICE_ID" # this is the hash code for the voice not the name! - self.write_audio = 1 + def __init__(self): + from elevenlabs.client import ElevenLabs + + self.client = ElevenLabs( + api_key=settings.ELEVENLABS_API_KEY, + ) + def text_to_speech(self, text): - asyncio.run(self._text_to_speech_websocket(text)) + lang = "en" + audio = self.client.generate( + text=text, + model="eleven_multilingual_v2", + voice="Brian", + ) + audio_data = BytesIO() + for chunk in audio: + audio_data.write(chunk) + audio_bytes = audio_data.getvalue() - async def _text_to_speech_websocket(self, text): - uri = f"wss://api.elevenlabs.io/v1/text-to-speech/{self.voice}/stream-input?model_id={self.model}" - websocket = await websockets.connect(uri) - payload = { - "text": " ", - "voice_settings": { - "stability": 0.5, - "similarity_boost": 0.8, - }, - "xi_api_key": self.api_key, - } - - await websocket.send(json.dumps(payload)) - - async def listen(): - while 1: - try: - msg = await websocket.recv() - data = json.loads(msg) - - if data.get("audio"): - print("audio received") - yield base64.b64decode(data["audio"]) - elif data.get("isFinal"): - break - except websockets.exceptions.ConnectionClosed: - print("websocket closed") - break - listen_task = asyncio.create_task(self.stream(listen())) - - await websocket.send(json.dumps({"text": text})) - # this is to signal the end of the text, either use this or flush - await websocket.send(json.dumps({"text": ""})) - - await listen_task - - async def stream(self, audio_stream): - if self.write_audio: - audio_bytes = BytesIO() - async for chunk in audio_stream: - if chunk: - audio_bytes.write(chunk) - with open("output_audio.mp3", "wb") as f: - f.write(audio_bytes.getvalue()) - - else: - async for chunk in audio_stream: - pass # depends on the streamer! - - -def test_elevenlabs_websocket(): - """ - Tests the ElevenlabsTTS text_to_speech method with a sample prompt. - Prints out the base64-encoded result and writes it to 'output_audio.mp3'. - """ - # Instantiate your TTS class - tts = ElevenlabsTTS() - - # Call the method with some sample text - tts.text_to_speech("Hello from ElevenLabs WebSocket!") - - print("Saved audio to output_audio.mp3.") - - -if __name__ == "__main__": - test_elevenlabs_websocket() + # Encode to base64 + audio_base64 = base64.b64encode(audio_bytes).decode("utf-8") + return audio_base64, lang \ No newline at end of file