initial websocket impl

2025-12-01 17:43:15 +00:00 · 2025-01-13 00:33:10 -05:00
parent 628784da35
commit a4eb4ea66d
1 changed files with 64 additions and 20 deletions
--- a/application/tts/elevenlabs.py
+++ b/application/tts/elevenlabs.py
@@ -1,29 +1,73 @@
-from io import BytesIO
+import asyncio
+import websockets
+import json
 import base64
+from io import BytesIO
 from application.tts.base import BaseTTS


 class ElevenlabsTTS(BaseTTS):
-    def __init__(self):
-        from elevenlabs.client import ElevenLabs
-
-        self.client = ElevenLabs(
-            api_key="ELEVENLABS_API_KEY",
-            )
-    
+    def __init__(self):        
+        self.api_key = "ELEVENLABS_API_KEY"
+        self.model = "eleven_multilingual_v2"
+        self.voice = "Brian"

    def text_to_speech(self, text):
-        lang = "en"
-        audio = self.client.generate(
-            text=text,
-            model="eleven_multilingual_v2",
-            voice="Brian",
-        )
-        audio_data = BytesIO()
-        for chunk in audio:
-            audio_data.write(chunk)
-        audio_bytes = audio_data.getvalue()
-
-        # Encode to base64
+        audio_bytes = asyncio.run(self._text_to_speech_websocket(text))
        audio_base64 = base64.b64encode(audio_bytes).decode("utf-8")
+        lang = "en"
        return audio_base64, lang
+
+    async def _text_to_speech_websocket(self, text):
+        uri = "wss://api.elevenlabs.io/v1/tts-stream"
+        headers = {
+            "xi-api-key": self.api_key,
+            "Accept": "audio/mpeg"
+        }
+        payload = {
+            "text": text,
+            "model_id": self.model,
+            "voice_settings": {
+                "voice_id": self.voice
+            },
+        }
+        audio_data = BytesIO()
+
+        async with websockets.connect(uri, extra_headers=headers) as websocket:
+            
+            await websocket.send(json.dumps(payload))
+            
+            async for message in websocket:
+                if isinstance(message, bytes):
+                    audio_data.write(message)
+                else:
+                    print("Received a non-binary frame:", message)
+
+        return audio_data.getvalue()
+
+
+def test_elevenlabs_websocket():
+    """
+    Tests the ElevenlabsTTS text_to_speech method with a sample prompt.
+    Prints out the base64-encoded result and writes it to 'output_audio.mp3'.
+    """
+    # Instantiate your TTS class
+    tts = ElevenlabsTTS()
+
+    # Call the method with some sample text
+    audio_base64, lang = tts.text_to_speech("Hello from ElevenLabs WebSocket!")
+
+    print(f"Received language: {lang}")
+    print(f"Base64 Audio (truncated): {audio_base64[:100]}...")
+
+    # Optional: Save the audio to a local file for manual listening.
+    # We'll assume the audio is in MP3 format based on "Accept": "audio/mpeg".
+    audio_bytes = base64.b64decode(audio_base64)
+    with open("output_audio.mp3", "wb") as f:
+        f.write(audio_bytes)
+
+    print("Saved audio to output_audio.mp3.")
+
+
+if __name__ == "__main__":
+    test_elevenlabs_websocket()