Python WebSocket Client

import asyncio
import websockets
import json
import base64
import os

async def generate_speech():
    """Stream a few sentences to the TTS WebSocket and save the audio.

    Reads the API key from the ``TOGETHER_API_KEY`` environment variable,
    opens the WebSocket, waits for the ``session.created`` event, then sends
    text chunks and receives audio events concurrently. The decoded audio
    chunks are concatenated and written to ``output.pcm``.

    Raises:
        RuntimeError: If ``TOGETHER_API_KEY`` is not set.
    """
    api_key = os.environ.get("TOGETHER_API_KEY")
    if not api_key:
        # Fail fast instead of sending "Bearer None" and getting an opaque
        # handshake rejection.
        raise RuntimeError("TOGETHER_API_KEY environment variable is not set")

    url = "wss://api.together.ai/v1/audio/speech/websocket?model=hexgrad/Kokoro-82M&voice=af_heart"

    headers = {
        "Authorization": f"Bearer {api_key}"
    }

    async with websockets.connect(url, additional_headers=headers) as ws:
        # The first message must be the session confirmation.
        session_msg = await ws.recv()
        session_data = json.loads(session_msg)
        if session_data.get("type") != "session.created":
            print(f"Failed to start session: {session_data}")
            return
        print(f"Session created: {session_data['session']['id']}")

        # Send text for TTS
        text_chunks = [
            "Hello, this is a test.",
            "This is the second sentence.",
            "And this is the final one."
        ]

        async def send_text():
            """Append each text chunk, then commit the buffer."""
            for chunk in text_chunks:
                await ws.send(json.dumps({
                    "type": "input_text_buffer.append",
                    "text": chunk
                }))
                await asyncio.sleep(0.5)  # Simulate typing

            # Commit to process any remaining text
            await ws.send(json.dumps({
                "type": "input_text_buffer.commit"
            }))

        async def receive_audio():
            """Collect audio deltas until the stream ends or a failure arrives."""
            audio_data = bytearray()
            async for message in ws:
                data = json.loads(message)
                # .get() so an event without a "type" field is ignored
                # rather than raising KeyError.
                event_type = data.get("type")

                if event_type == "conversation.item.input_text.received":
                    print(f"Text received: {data['text']}")
                elif event_type == "conversation.item.audio_output.delta":
                    # Decode base64 audio chunk
                    audio_chunk = base64.b64decode(data['delta'])
                    audio_data.extend(audio_chunk)
                    print(f"Received audio chunk for item {data['item_id']}")
                elif event_type == "conversation.item.audio_output.done":
                    print(f"Audio generation complete for item {data['item_id']}")
                elif event_type == "conversation.item.tts.failed":
                    error = data.get("error", {})
                    print(f"Error: {error.get('message')}")
                    break

            if not audio_data:
                # Nothing was received (e.g. the request failed), so do not
                # write an empty file or claim that audio was saved.
                print("No audio received; nothing saved")
                return

            # Save the raw PCM samples to a file
            with open("output.pcm", "wb") as f:
                f.write(audio_data)
            print("Audio saved to output.pcm")

        # Run send and receive concurrently
        await asyncio.gather(send_text(), receive_audio())

asyncio.run(generate_speech())

import WebSocket from 'ws';
import fs from 'fs';

// Streams a few sentences to the TTS WebSocket, collects the base64 audio
// deltas, and writes the concatenated audio to output.pcm when the socket closes.
const apiKey = process.env.TOGETHER_API_KEY;
if (!apiKey) {
  // Fail fast instead of sending "Bearer undefined" in the handshake.
  throw new Error('TOGETHER_API_KEY environment variable is not set');
}
const url = 'wss://api.together.ai/v1/audio/speech/websocket?model=hexgrad/Kokoro-82M&voice=af_heart';

const ws = new WebSocket(url, {
  headers: {
    'Authorization': `Bearer ${apiKey}`
  }
});

const audioData = [];

// Serialize and send an event only while the socket is open. The delayed
// timers below can fire after ws.close() has been called on a failure.
const sendEvent = (event) => {
  if (ws.readyState === WebSocket.OPEN) {
    ws.send(JSON.stringify(event));
  }
};

ws.on('open', () => {
  console.log('WebSocket connection established!');
});

ws.on('message', (data) => {
  const message = JSON.parse(data.toString());

  if (message.type === 'session.created') {
    console.log(`Session created: ${message.session.id}`);

    // Send text chunks
    const textChunks = [
      "Hello, this is a test.",
      "This is the second sentence.",
      "And this is the final one."
    ];

    // Stagger the chunks by 500 ms each to simulate typing.
    textChunks.forEach((text, index) => {
      setTimeout(() => {
        sendEvent({
          type: 'input_text_buffer.append',
          text: text
        });
      }, index * 500);
    });

    // Commit after all chunks
    setTimeout(() => {
      sendEvent({
        type: 'input_text_buffer.commit'
      });
    }, textChunks.length * 500 + 100);

  } else if (message.type === 'conversation.item.input_text.received') {
    console.log(`Text received: ${message.text}`);
  } else if (message.type === 'conversation.item.audio_output.delta') {
    // Decode base64 audio chunk
    const audioChunk = Buffer.from(message.delta, 'base64');
    audioData.push(audioChunk);
    console.log(`Received audio chunk for item ${message.item_id}`);
  } else if (message.type === 'conversation.item.audio_output.done') {
    console.log(`Audio generation complete for item ${message.item_id}`);
  } else if (message.type === 'conversation.item.tts.failed') {
    const errorMessage = message.error?.message ?? 'Unknown error';
    console.error(`Error: ${errorMessage}`);
    ws.close();
  }
});

ws.on('close', () => {
  // Save the raw PCM samples to a file
  if (audioData.length > 0) {
    const completeAudio = Buffer.concat(audioData);
    fs.writeFileSync('output.pcm', completeAudio);
    console.log('Audio saved to output.pcm');
  }
});

ws.on('error', (error) => {
  console.error('WebSocket error:', error);
});

# Plain HTTPS GET to the endpoint URL with a Bearer token.
# No WebSocket upgrade headers are sent by this command.
curl --request GET \
  --url https://api.together.ai/v1/audio/speech/websocket \
  --header 'Authorization: Bearer <token>'

<?php

// Send an authenticated GET request to the speech endpoint URL and print
// either the response body or the cURL error message.
$handle = curl_init();

$options = [
  CURLOPT_URL => "https://api.together.ai/v1/audio/speech/websocket",
  CURLOPT_RETURNTRANSFER => true,
  CURLOPT_ENCODING => "",
  CURLOPT_MAXREDIRS => 10,
  CURLOPT_TIMEOUT => 30,
  CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
  CURLOPT_CUSTOMREQUEST => "GET",
  CURLOPT_HTTPHEADER => [
    "Authorization: Bearer <token>"
  ],
];
curl_setopt_array($handle, $options);

// Capture both the body and any error text before releasing the handle.
$body = curl_exec($handle);
$failure = curl_error($handle);

curl_close($handle);

if (!$failure) {
  echo $body;
} else {
  echo "cURL Error #:" . $failure;
}

package main

import (
	"fmt"
	"net/http"
	"io"
)

// main sends an authenticated GET request to the speech endpoint URL and
// prints the response body. Each failure is reported and ends the program
// instead of being ignored.
func main() {

	url := "https://api.together.ai/v1/audio/speech/websocket"

	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		fmt.Println("failed to build request:", err)
		return
	}

	req.Header.Add("Authorization", "Bearer <token>")

	res, err := http.DefaultClient.Do(req)
	if err != nil {
		// res is nil when Do fails, so it must not be dereferenced below.
		fmt.Println("request failed:", err)
		return
	}
	defer res.Body.Close()

	body, err := io.ReadAll(res.Body)
	if err != nil {
		fmt.Println("failed to read response body:", err)
		return
	}

	fmt.Println(string(body))

}

// Issue a GET request to the endpoint URL with a Bearer Authorization header
// and read the response body as a String.
HttpResponse<String> response = Unirest.get("https://api.together.ai/v1/audio/speech/websocket")
  .header("Authorization", "Bearer <token>")
  .asString();

require 'uri'
require 'net/http'

# Build an authenticated GET request for the speech endpoint URL, send it
# over TLS, and print the response body.
endpoint = URI("https://api.together.ai/v1/audio/speech/websocket")

client = Net::HTTP.new(endpoint.host, endpoint.port)
client.use_ssl = true

get_request = Net::HTTP::Get.new(endpoint)
get_request["Authorization"] = 'Bearer <token>'

puts client.request(get_request).read_body

Audio

Create realtime text-to-speech

Establishes a WebSocket connection for real-time text-to-speech generation. This endpoint uses the WebSocket protocol (wss://api.together.ai/v1/audio/speech/websocket) for bidirectional streaming communication.

Connection Setup:

Protocol: WebSocket (wss://)
Authentication: Pass API key as Bearer token in Authorization header
Parameters: Sent as query parameters (model, voice, max_partial_length, language)

Client Events:

tts_session.updated: Update session parameters such as voice. The session object also accepts an extra_params field for additional model-specific parameters that fine-tune speech generation behavior. One such parameter is pronunciation_dict, a list of pronunciation rules for specific characters or symbols. Each entry uses the format "<source>/<replacement>" (e.g., ["omg/oh my god"]) and overrides how the model pronounces matching tokens.
```
{
  "type": "tts_session.updated",
  "session": {
    "voice": "tara",
    "extra_params": {
      "pronunciation_dict": ["omg/oh my god"]
    }
  }
}
```

input_text_buffer.append: Send text chunks for TTS generation

{
  "type": "input_text_buffer.append",
  "text": "Hello, this is a test."
}

input_text_buffer.clear: Clear the buffered text
```
{
  "type": "input_text_buffer.clear"
}
```
input_text_buffer.commit: Signal end of text input and process remaining text
```
{
  "type": "input_text_buffer.commit"
}
```

Server Events:

session.created: Initial session confirmation (sent first)

{
  "event_id": "evt_123456",
  "type": "session.created",
  "session": {
    "id": "session-id",
    "object": "realtime.tts.session",
    "modalities": ["text", "audio"],
    "model": "hexgrad/Kokoro-82M",
    "voice": "tara"
  }
}

conversation.item.input_text.received: Acknowledgment that text was received

{
  "type": "conversation.item.input_text.received",
  "text": "Hello, this is a test."
}

conversation.item.audio_output.delta: Audio chunks as base64-encoded data

{
  "type": "conversation.item.audio_output.delta",
  "item_id": "tts_1",
  "delta": "<base64_encoded_audio_chunk>"
}

conversation.item.audio_output.done: Audio generation complete for an item

{
  "type": "conversation.item.audio_output.done",
  "item_id": "tts_1"
}

conversation.item.tts.failed: Error occurred

{
  "type": "conversation.item.tts.failed",
  "error": {
    "message": "Error description",
    "type": "invalid_request_error",
    "param": null,
    "code": "invalid_api_key"
  }
}

Text Processing:

Partial text (no sentence ending) is held in buffer until:
- The server determines that the text is complete enough to be processed for TTS generation
- The partial text exceeds max_partial_length characters (default: 250)
- The input_text_buffer.commit event is received

Audio Format:

Format: Raw PCM (s16le, mono)
Sample Rate: 24000 Hz
Encoding: Base64 (per delta event)
Delivered via conversation.item.audio_output.delta events

Error Codes:

invalid_api_key: Invalid API key provided (401)
missing_api_key: Authorization header missing (401)
model_not_available: Invalid or unavailable model (400)
Invalid text format errors (400)

GET

audio

speech

websocket

Python WebSocket Client

import asyncio
import websockets
import json
import base64
import os

async def generate_speech():
    """Stream a few sentences to the TTS WebSocket and save the audio.

    Reads the API key from the ``TOGETHER_API_KEY`` environment variable,
    opens the WebSocket, waits for the ``session.created`` event, then sends
    text chunks and receives audio events concurrently. The decoded audio
    chunks are concatenated and written to ``output.pcm``.

    Raises:
        RuntimeError: If ``TOGETHER_API_KEY`` is not set.
    """
    api_key = os.environ.get("TOGETHER_API_KEY")
    if not api_key:
        # Fail fast instead of sending "Bearer None" and getting an opaque
        # handshake rejection.
        raise RuntimeError("TOGETHER_API_KEY environment variable is not set")

    url = "wss://api.together.ai/v1/audio/speech/websocket?model=hexgrad/Kokoro-82M&voice=af_heart"

    headers = {
        "Authorization": f"Bearer {api_key}"
    }

    async with websockets.connect(url, additional_headers=headers) as ws:
        # The first message must be the session confirmation.
        session_msg = await ws.recv()
        session_data = json.loads(session_msg)
        if session_data.get("type") != "session.created":
            print(f"Failed to start session: {session_data}")
            return
        print(f"Session created: {session_data['session']['id']}")

        # Send text for TTS
        text_chunks = [
            "Hello, this is a test.",
            "This is the second sentence.",
            "And this is the final one."
        ]

        async def send_text():
            """Append each text chunk, then commit the buffer."""
            for chunk in text_chunks:
                await ws.send(json.dumps({
                    "type": "input_text_buffer.append",
                    "text": chunk
                }))
                await asyncio.sleep(0.5)  # Simulate typing

            # Commit to process any remaining text
            await ws.send(json.dumps({
                "type": "input_text_buffer.commit"
            }))

        async def receive_audio():
            """Collect audio deltas until the stream ends or a failure arrives."""
            audio_data = bytearray()
            async for message in ws:
                data = json.loads(message)
                # .get() so an event without a "type" field is ignored
                # rather than raising KeyError.
                event_type = data.get("type")

                if event_type == "conversation.item.input_text.received":
                    print(f"Text received: {data['text']}")
                elif event_type == "conversation.item.audio_output.delta":
                    # Decode base64 audio chunk
                    audio_chunk = base64.b64decode(data['delta'])
                    audio_data.extend(audio_chunk)
                    print(f"Received audio chunk for item {data['item_id']}")
                elif event_type == "conversation.item.audio_output.done":
                    print(f"Audio generation complete for item {data['item_id']}")
                elif event_type == "conversation.item.tts.failed":
                    error = data.get("error", {})
                    print(f"Error: {error.get('message')}")
                    break

            if not audio_data:
                # Nothing was received (e.g. the request failed), so do not
                # write an empty file or claim that audio was saved.
                print("No audio received; nothing saved")
                return

            # Save the raw PCM samples to a file
            with open("output.pcm", "wb") as f:
                f.write(audio_data)
            print("Audio saved to output.pcm")

        # Run send and receive concurrently
        await asyncio.gather(send_text(), receive_audio())

asyncio.run(generate_speech())

import WebSocket from 'ws';
import fs from 'fs';

// Streams a few sentences to the TTS WebSocket, collects the base64 audio
// deltas, and writes the concatenated audio to output.pcm when the socket closes.
const apiKey = process.env.TOGETHER_API_KEY;
if (!apiKey) {
  // Fail fast instead of sending "Bearer undefined" in the handshake.
  throw new Error('TOGETHER_API_KEY environment variable is not set');
}
const url = 'wss://api.together.ai/v1/audio/speech/websocket?model=hexgrad/Kokoro-82M&voice=af_heart';

const ws = new WebSocket(url, {
  headers: {
    'Authorization': `Bearer ${apiKey}`
  }
});

const audioData = [];

// Serialize and send an event only while the socket is open. The delayed
// timers below can fire after ws.close() has been called on a failure.
const sendEvent = (event) => {
  if (ws.readyState === WebSocket.OPEN) {
    ws.send(JSON.stringify(event));
  }
};

ws.on('open', () => {
  console.log('WebSocket connection established!');
});

ws.on('message', (data) => {
  const message = JSON.parse(data.toString());

  if (message.type === 'session.created') {
    console.log(`Session created: ${message.session.id}`);

    // Send text chunks
    const textChunks = [
      "Hello, this is a test.",
      "This is the second sentence.",
      "And this is the final one."
    ];

    // Stagger the chunks by 500 ms each to simulate typing.
    textChunks.forEach((text, index) => {
      setTimeout(() => {
        sendEvent({
          type: 'input_text_buffer.append',
          text: text
        });
      }, index * 500);
    });

    // Commit after all chunks
    setTimeout(() => {
      sendEvent({
        type: 'input_text_buffer.commit'
      });
    }, textChunks.length * 500 + 100);

  } else if (message.type === 'conversation.item.input_text.received') {
    console.log(`Text received: ${message.text}`);
  } else if (message.type === 'conversation.item.audio_output.delta') {
    // Decode base64 audio chunk
    const audioChunk = Buffer.from(message.delta, 'base64');
    audioData.push(audioChunk);
    console.log(`Received audio chunk for item ${message.item_id}`);
  } else if (message.type === 'conversation.item.audio_output.done') {
    console.log(`Audio generation complete for item ${message.item_id}`);
  } else if (message.type === 'conversation.item.tts.failed') {
    const errorMessage = message.error?.message ?? 'Unknown error';
    console.error(`Error: ${errorMessage}`);
    ws.close();
  }
});

ws.on('close', () => {
  // Save the raw PCM samples to a file
  if (audioData.length > 0) {
    const completeAudio = Buffer.concat(audioData);
    fs.writeFileSync('output.pcm', completeAudio);
    console.log('Audio saved to output.pcm');
  }
});

ws.on('error', (error) => {
  console.error('WebSocket error:', error);
});

# Plain HTTPS GET to the endpoint URL with a Bearer token.
# No WebSocket upgrade headers are sent by this command.
curl --request GET \
  --url https://api.together.ai/v1/audio/speech/websocket \
  --header 'Authorization: Bearer <token>'

<?php

// Send an authenticated GET request to the speech endpoint URL and print
// either the response body or the cURL error message.
$handle = curl_init();

$options = [
  CURLOPT_URL => "https://api.together.ai/v1/audio/speech/websocket",
  CURLOPT_RETURNTRANSFER => true,
  CURLOPT_ENCODING => "",
  CURLOPT_MAXREDIRS => 10,
  CURLOPT_TIMEOUT => 30,
  CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
  CURLOPT_CUSTOMREQUEST => "GET",
  CURLOPT_HTTPHEADER => [
    "Authorization: Bearer <token>"
  ],
];
curl_setopt_array($handle, $options);

// Capture both the body and any error text before releasing the handle.
$body = curl_exec($handle);
$failure = curl_error($handle);

curl_close($handle);

if (!$failure) {
  echo $body;
} else {
  echo "cURL Error #:" . $failure;
}

package main

import (
	"fmt"
	"net/http"
	"io"
)

// main sends an authenticated GET request to the speech endpoint URL and
// prints the response body. Each failure is reported and ends the program
// instead of being ignored.
func main() {

	url := "https://api.together.ai/v1/audio/speech/websocket"

	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		fmt.Println("failed to build request:", err)
		return
	}

	req.Header.Add("Authorization", "Bearer <token>")

	res, err := http.DefaultClient.Do(req)
	if err != nil {
		// res is nil when Do fails, so it must not be dereferenced below.
		fmt.Println("request failed:", err)
		return
	}
	defer res.Body.Close()

	body, err := io.ReadAll(res.Body)
	if err != nil {
		fmt.Println("failed to read response body:", err)
		return
	}

	fmt.Println(string(body))

}

// Issue a GET request to the endpoint URL with a Bearer Authorization header
// and read the response body as a String.
HttpResponse<String> response = Unirest.get("https://api.together.ai/v1/audio/speech/websocket")
  .header("Authorization", "Bearer <token>")
  .asString();

require 'uri'
require 'net/http'

# Build an authenticated GET request for the speech endpoint URL, send it
# over TLS, and print the response body.
endpoint = URI("https://api.together.ai/v1/audio/speech/websocket")

client = Net::HTTP.new(endpoint.host, endpoint.port)
client.use_ssl = true

get_request = Net::HTTP::Get.new(endpoint)
get_request["Authorization"] = 'Bearer <token>'

puts client.request(get_request).read_body

Multi-context support

All client and server message types support an optional context_id field. This allows you to manage multiple independent TTS streams over a single WebSocket connection.

Field	Type	Required	Description
`context_id`	string	No	Identifies which context this message applies to. Defaults to `"default"` if omitted. For `tts_session.updated`, omitting `context_id` updates all contexts.

Additional client message types

context.cancel — Cancel and clean up a specific context.

{
  "type": "context.cancel",
  "context_id": "conversation-1"
}

Additional server message types

context.cancelled — Confirms a context was cancelled.

{
  "type": "context.cancelled",
  "context_id": "conversation-1"
}

Authorizations

Authorization

string

header

default:default

required

Bearer authentication header of the form Bearer <token>, where <token> is your auth token.

Query Parameters

model

enum<string>

default:hexgrad/Kokoro-82M

The TTS model to use for speech generation. Can also be set via tts_session.updated event.

Available options:

hexgrad/Kokoro-82M,

cartesia/sonic-english

voice

string

The voice to use for speech generation. Default is 'tara'. Available voices vary by model. Can also be updated via tts_session.updated event.

max_partial_length

integer

default:250

Maximum number of characters in partial text before forcing TTS generation even without a sentence ending. Helps reduce latency for long text without punctuation.

language

string

default:en

Language or locale of input text. Accepts ISO 639-1 language codes (e.g., en, fr, es, zh) as well as locale codes for region-specific variants. Locale codes must be lowercase (e.g., zh-hk for Cantonese). Can also be set via tts_session.updated event.

Example:

"en"

Response

101

Switching Protocols - WebSocket connection established successfully.

Error message format:

{
  "type": "conversation.item.tts.failed",
  "error": {
    "message": "Error description",
    "type": "invalid_request_error",
    "param": null,
    "code": "error_code"
  }
}

Create audio generation request

Create audio transcription request

⌘I

Multi-context support

Additional client message types

Additional server message types

Authorizations

Query Parameters

Response

Multi-context support

Additional client message types

Additional server message types