diff --git a/application/tts/elevenlabs.py b/application/tts/elevenlabs.py
index 96fb1f43..0d982d31 100644
--- a/application/tts/elevenlabs.py
+++ b/application/tts/elevenlabs.py
@@ -8,41 +8,62 @@ from base import BaseTTS
 class ElevenlabsTTS(BaseTTS):
     def __init__(self):
-        self.api_key = 'sk_19b72c883e8bdfcec2705be2d048f3830a40d2faa4b76b26'
-        self.model = "eleven_multilingual_v2"
-        self.voice = "Brian"
+        self.api_key = 'ELEVENLABS_API_KEY'# here you should put your api key
+        self.model = "eleven_flash_v2_5"
+        self.voice = "VOICE_ID" # this is the hash code for the voice not the name!
+        self.write_audio = 1
 
     def text_to_speech(self, text):
-        audio_bytes = asyncio.run(self._text_to_speech_websocket(text))
-        audio_base64 = base64.b64encode(audio_bytes).decode("utf-8")
-        lang = "en"
-        return audio_base64, lang
+        asyncio.run(self._text_to_speech_websocket(text))
 
     async def _text_to_speech_websocket(self, text):
         uri = f"wss://api.elevenlabs.io/v1/text-to-speech/{self.voice}/stream-input?model_id={self.model}"
-
+        websocket = await websockets.connect(uri)
         payload = {
-            "text": text,
-            "model_id": self.model,
+            "text": " ",
             "voice_settings": {
-                "voice_id": self.voice
+                "stability": 0.5,
+                "similarity_boost": 0.8,
             },
-            "xi-api-key": self.api_key,
-            "Accept": "audio/mpeg"
+            "xi_api_key": self.api_key,
         }
-        audio_data = BytesIO()
-        async with websockets.connect(uri) as websocket:
-
-            await websocket.send(json.dumps(payload))
-
-            async for message in websocket:
-                if isinstance(message, bytes):
-                    audio_data.write(message)
-                else:
-                    print("Received a non-binary frame:", message)
+        await websocket.send(json.dumps(payload))
+
+        async def listen():
+            while 1:
+                try:
+                    msg = await websocket.recv()
+                    data = json.loads(msg)
-
-        return audio_data.getvalue()
+                    if data.get("audio"):
+                        print("audio received")
+                        yield base64.b64decode(data["audio"])
+                    elif data.get("isFinal"):
+                        break
+                except websockets.exceptions.ConnectionClosed:
+                    print("websocket closed")
+                    break
+        listen_task = asyncio.create_task(self.stream(listen()))
+
+        await websocket.send(json.dumps({"text": text}))
+        # this is to signal the end of the text, either use this or flush
+        await websocket.send(json.dumps({"text": ""}))
+
+        await listen_task
+
+    async def stream(self, audio_stream):
+        if self.write_audio:
+            audio_bytes = BytesIO()
+            async for chunk in audio_stream:
+                if chunk:
+                    audio_bytes.write(chunk)
+            with open("output_audio.mp3", "wb") as f:
+                f.write(audio_bytes.getvalue())
+
+        else:
+            async for chunk in audio_stream:
+                pass # depends on the streamer!
 
 
 def test_elevenlabs_websocket():
@@ -54,16 +75,7 @@ def test_elevenlabs_websocket():
 
     tts = ElevenlabsTTS()
 
     # Call the method with some sample text
-    audio_base64, lang = tts.text_to_speech("Hello from ElevenLabs WebSocket!")
-
-    print(f"Received language: {lang}")
-    print(f"Base64 Audio (truncated): {audio_base64[:100]}...")
-
-    # Optional: Save the audio to a local file for manual listening.
-    # We'll assume the audio is in MP3 format based on "Accept": "audio/mpeg".
-    audio_bytes = base64.b64decode(audio_base64)
-    with open("output_audio.mp3", "wb") as f:
-        f.write(audio_bytes)
+    tts.text_to_speech("Hello from ElevenLabs WebSocket!")
 
     print("Saved audio to output_audio.mp3.")
diff --git a/application/tts/output_audio.mp3 b/application/tts/output_audio.mp3
new file mode 100644
index 00000000..fd0bed29
Binary files /dev/null and b/application/tts/output_audio.mp3 differ