Using Your Own OpenAI Key for LLM

To integrate a custom OpenAI key, create a secret containing your OPENAI_API_KEY:
1

Navigate to the “Secrets” page and select “Add Secret”
2

Choose “Custom LLM” from the dropdown menu.
3

Enter the URL, your model, and the secret you created.

Custom LLM Server

To bring a custom LLM server, set up a compatible server endpoint using OpenAI’s style, specifically targeting create_chat_completion. Here’s an example server implementation using FastAPI and OpenAI’s Python SDK:
import json
import os
import fastapi
from fastapi.responses import StreamingResponse
from openai import AsyncOpenAI
import uvicorn
import logging
from dotenv import load_dotenv
from pydantic import BaseModel
from typing import List, Optional

# Load environment variables from .env file
load_dotenv()

# Retrieve API key from environment
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
if not OPENAI_API_KEY:
    raise ValueError("OPENAI_API_KEY not found in environment variables")

app = fastapi.FastAPI()
oai_client = AsyncOpenAI(api_key=OPENAI_API_KEY)

class Message(BaseModel):
    role: str
    content: str

class ChatCompletionRequest(BaseModel):
    messages: List[Message]
    model: str
    temperature: Optional[float] = 0.7
    max_tokens: Optional[int] = None
    stream: Optional[bool] = False
    user_id: Optional[str] = None

@app.post("/v1/chat/completions")
async def create_chat_completion(request: ChatCompletionRequest) -> StreamingResponse:
    oai_request = request.dict(exclude_none=True)
    if "user_id" in oai_request:
        oai_request["user"] = oai_request.pop("user_id")

    chat_completion_coroutine = await oai_client.chat.completions.create(**oai_request)

    async def event_stream():
        try:
            async for chunk in chat_completion_coroutine:
                # Convert the ChatCompletionChunk to a dictionary before JSON serialization
                chunk_dict = chunk.model_dump()
                yield f"data: {json.dumps(chunk_dict)}\n\n"
            yield "data: [DONE]\n\n"
        except Exception as e:
            logging.error("An error occurred: %s", str(e))
            yield f"data: {json.dumps({'error': 'Internal error occurred!'})}\n\n"

    return StreamingResponse(event_stream(), media_type="text/event-stream")

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8013)
Run this code or your own server code.

Setting Up a Public URL for Your Server

To make your server accessible, create a public URL using a tunneling tool like ngrok:
ngrok http --url=<Your url>.ngrok.app 8013

Configuring Elevenlabs CustomLLM

Now let’s make the changes in Elevenlabs
Direct your server URL to ngrok endpoint, setup “Limit token usage” to 5000 and you can setup CustomLLM extra body to true if you want to pass additional parameters to your custom LLM implementation at conversation start. You can start interacting with Conversational AI with your own LLM server

Additional Features

You may pass additional parameters to your custom LLM implementation.
1

Define the Extra Parameters

Create an object containing your custom parameters:
from elevenlabs.conversational_ai.conversation import Conversation, ConversationConfig

extra_body_for_convai = {
    "UUID": "123e4567-e89b-12d3-a456-426614174000",
    "parameter-1": "value-1",
    "parameter-2": "value-2",
}

config = ConversationConfig(
    extra_body=extra_body_for_convai,
)
2

Update the LLM Implementation

Modify your custom LLM code to handle the additional parameters:
import json
import os
import fastapi
from fastapi.responses import StreamingResponse
from fastapi import Request
from openai import AsyncOpenAI
import uvicorn
import logging
from dotenv import load_dotenv
from pydantic import BaseModel
from typing import List, Optional

# Load environment variables from .env file
load_dotenv()

# Retrieve API key from environment
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
if not OPENAI_API_KEY:
    raise ValueError("OPENAI_API_KEY not found in environment variables")

app = fastapi.FastAPI()
oai_client = AsyncOpenAI(api_key=OPENAI_API_KEY)

class Message(BaseModel):
    role: str
    content: str

class ChatCompletionRequest(BaseModel):
    messages: List[Message]
    model: str
    temperature: Optional[float] = 0.7
    max_tokens: Optional[int] = None
    stream: Optional[bool] = False
    user_id: Optional[str] = None
    elevenlabs_extra_body: Optional[dict] = None

@app.post("/v1/chat/completions")
async def create_chat_completion(request: ChatCompletionRequest) -> StreamingResponse:
    oai_request = request.dict(exclude_none=True)
    print(oai_request)
    if "user_id" in oai_request:
        oai_request["user"] = oai_request.pop("user_id")

    if "elevenlabs_extra_body" in oai_request:
        oai_request.pop("elevenlabs_extra_body")

    chat_completion_coroutine = await oai_client.chat.completions.create(**oai_request)

    async def event_stream():
        try:
            async for chunk in chat_completion_coroutine:
                chunk_dict = chunk.model_dump()
                yield f"data: {json.dumps(chunk_dict)}\n\n"
            yield "data: [DONE]\n\n"
        except Exception as e:
            logging.error("An error occurred: %s", str(e))
            yield f"data: {json.dumps({'error': 'Internal error occurred!'})}\n\n"

    return StreamingResponse(event_stream(), media_type="text/event-stream")

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8013)

Example Request

With this custom message setup, your LLM will receive requests in this format:
{
  "messages": [
    {
      "role": "system",
      "content": "\n  <Redacted>"
    },
    {
      "role": "assistant",
      "content": "Hey I'm currently unavailable."
    },
    {
      "role": "user",
      "content": "Hey, who are you?"
    }
  ],
  "model": "gpt-4o",
  "temperature": 0.5,
  "max_tokens": 5000,
  "stream": true,
  "elevenlabs_extra_body": {
    "UUID": "123e4567-e89b-12d3-a456-426614174000",
    "parameter-1": "value-1",
    "parameter-2": "value-2"
  }
}