Using Your Own OpenAI Key for LLM
To integrate a custom OpenAI key, create a secret containing your OPENAI_API_KEY and then configure your agent to use it:
1. Navigate to the “Secrets” page and select “Add Secret”.
2. Choose “Custom LLM” from the dropdown menu.
3. Enter the URL, your model, and the secret you created.
Custom LLM Server
To bring your own LLM server, set up an endpoint that is compatible with OpenAI’s chat completions API, i.e. one that implements create_chat_completion.
Here’s an example server implementation using FastAPI and OpenAI’s Python SDK:
import json
import os
import fastapi
from fastapi.responses import StreamingResponse
from openai import AsyncOpenAI
import uvicorn
import logging
from dotenv import load_dotenv
from pydantic import BaseModel
from typing import List, Optional

# Load environment variables from .env file
load_dotenv()

# Retrieve API key from environment
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise ValueError("OPENAI_API_KEY not found in environment variables")

app = fastapi.FastAPI()
oai_client = AsyncOpenAI(api_key=OPENAI_API_KEY)


class Message(BaseModel):
    role: str
    content: str


class ChatCompletionRequest(BaseModel):
    messages: List[Message]
    model: str
    temperature: Optional[float] = 0.7
    max_tokens: Optional[int] = None
    stream: Optional[bool] = False
    user_id: Optional[str] = None


@app.post("/v1/chat/completions")
async def create_chat_completion(request: ChatCompletionRequest) -> StreamingResponse:
    oai_request = request.dict(exclude_none=True)
    # OpenAI expects the caller identifier under "user", not "user_id"
    if "user_id" in oai_request:
        oai_request["user"] = oai_request.pop("user_id")

    chat_completion_coroutine = await oai_client.chat.completions.create(**oai_request)

    async def event_stream():
        try:
            async for chunk in chat_completion_coroutine:
                # Convert the ChatCompletionChunk to a dictionary before JSON serialization
                chunk_dict = chunk.model_dump()
                yield f"data: {json.dumps(chunk_dict)}\n\n"
            yield "data: [DONE]\n\n"
        except Exception as e:
            logging.error("An error occurred: %s", str(e))
            yield f"data: {json.dumps({'error': 'Internal error occurred!'})}\n\n"

    # Stream the chunks back as server-sent events
    return StreamingResponse(event_stream(), media_type="text/event-stream")


if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8013)
Run this code, or your own server implementation.
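Before exposing the server publicly, you can sanity-check it locally by pointing the OpenAI Python client at your endpoint. The snippet below is a minimal sketch: the model name and prompt are placeholders, and the api_key value is a dummy since this example server does not check authentication.

# Minimal local smoke test for the server above (sketch; model and prompt are placeholders)
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8013/v1", api_key="unused")  # dummy key: the server does no auth

stream = client.chat.completions.create(
    model="gpt-4o",
    messages=[{"role": "user", "content": "Say hello in one short sentence."}],
    stream=True,
)
for chunk in stream:
    delta = chunk.choices[0].delta.content
    if delta:
        print(delta, end="", flush=True)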
Setting Up a Public URL for Your Server
To make your server accessible, create a public URL using a tunneling tool like ngrok:
ngrok http --url=<your-url>.ngrok.app 8013
Configuring the ElevenLabs Custom LLM
Now let’s make the changes in ElevenLabs:
Point the Server URL at your ngrok endpoint, set “Limit token usage” to 5000, and enable “Custom LLM extra body” if you want to pass additional parameters to your custom LLM implementation when a conversation starts.
You can now start interacting with Conversational AI backed by your own LLM server.
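If you prefer starting a session from code rather than the dashboard, a minimal sketch with the ElevenLabs Python SDK looks roughly like this; the agent ID and API key come from environment variables here, and the exact import paths and constructor arguments may vary between SDK versions.

# Rough sketch of starting a Conversational AI session from Python (paths/arguments may differ by SDK version)
import os
from elevenlabs.client import ElevenLabs
from elevenlabs.conversational_ai.conversation import Conversation
from elevenlabs.conversational_ai.default_audio_interface import DefaultAudioInterface

client = ElevenLabs(api_key=os.getenv("ELEVENLABS_API_KEY"))  # your ElevenLabs API key

conversation = Conversation(
    client,
    os.getenv("AGENT_ID"),  # the agent configured to use your custom LLM
    requires_auth=True,
    audio_interface=DefaultAudioInterface(),
)
conversation.start_session()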
Additional Features
You may pass additional parameters to your custom LLM implementation.
Define the Extra Parameters
Create an object containing your custom parameters:
from elevenlabs.conversational_ai.conversation import Conversation, ConversationConfig

extra_body_for_convai = {
    "UUID": "123e4567-e89b-12d3-a456-426614174000",
    "parameter-1": "value-1",
    "parameter-2": "value-2",
}

config = ConversationConfig(
    extra_body=extra_body_for_convai,
)
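To actually send these values, pass the config when constructing the conversation session, along the lines of the sketch below (assuming the Conversation constructor from the earlier sketch accepts a config argument; the client, agent ID, and audio interface are as before).

# Sketch: attach the extra-body config to the session (arguments other than config as in the earlier sketch)
conversation = Conversation(
    client,
    os.getenv("AGENT_ID"),
    requires_auth=True,
    audio_interface=DefaultAudioInterface(),
    config=config,
)
conversation.start_session()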
Update the LLM Implementation
Modify your custom LLM code to handle the additional parameters:
import json
import os
import fastapi
from fastapi.responses import StreamingResponse
from fastapi import Request
from openai import AsyncOpenAI
import uvicorn
import logging
from dotenv import load_dotenv
from pydantic import BaseModel
from typing import List, Optional

# Load environment variables from .env file
load_dotenv()

# Retrieve API key from environment
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise ValueError("OPENAI_API_KEY not found in environment variables")

app = fastapi.FastAPI()
oai_client = AsyncOpenAI(api_key=OPENAI_API_KEY)


class Message(BaseModel):
    role: str
    content: str


class ChatCompletionRequest(BaseModel):
    messages: List[Message]
    model: str
    temperature: Optional[float] = 0.7
    max_tokens: Optional[int] = None
    stream: Optional[bool] = False
    user_id: Optional[str] = None
    elevenlabs_extra_body: Optional[dict] = None


@app.post("/v1/chat/completions")
async def create_chat_completion(request: ChatCompletionRequest) -> StreamingResponse:
    oai_request = request.dict(exclude_none=True)
    print(oai_request)
    # OpenAI expects the caller identifier under "user", not "user_id"
    if "user_id" in oai_request:
        oai_request["user"] = oai_request.pop("user_id")
    # Strip the extra body before forwarding the request to OpenAI
    if "elevenlabs_extra_body" in oai_request:
        oai_request.pop("elevenlabs_extra_body")

    chat_completion_coroutine = await oai_client.chat.completions.create(**oai_request)

    async def event_stream():
        try:
            async for chunk in chat_completion_coroutine:
                chunk_dict = chunk.model_dump()
                yield f"data: {json.dumps(chunk_dict)}\n\n"
            yield "data: [DONE]\n\n"
        except Exception as e:
            logging.error("An error occurred: %s", str(e))
            yield f"data: {json.dumps({'error': 'Internal error occurred!'})}\n\n"

    return StreamingResponse(event_stream(), media_type="text/event-stream")


if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8013)
Example Request
With this custom message setup, your LLM will receive requests in this format:
{
  "messages": [
    {
      "role": "system",
      "content": "\n<Redacted>"
    },
    {
      "role": "assistant",
      "content": "Hey I'm currently unavailable."
    },
    {
      "role": "user",
      "content": "Hey, who are you?"
    }
  ],
  "model": "gpt-4o",
  "temperature": 0.5,
  "max_tokens": 5000,
  "stream": true,
  "elevenlabs_extra_body": {
    "UUID": "123e4567-e89b-12d3-a456-426614174000",
    "parameter-1": "value-1",
    "parameter-2": "value-2"
  }
}
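Note that the server above simply discards elevenlabs_extra_body before forwarding the request to OpenAI. If you want to act on those values, one possible approach (a sketch; lookup_user_context is a hypothetical helper you would implement yourself) is to fold them into a system message inside create_chat_completion before calling the model:

# Sketch: use the extra parameters instead of discarding them (place inside create_chat_completion)
extra = oai_request.pop("elevenlabs_extra_body", None) or {}
if "UUID" in extra:
    # lookup_user_context is hypothetical, e.g. a database or CRM lookup keyed on the UUID
    caller_context = lookup_user_context(extra["UUID"])
    oai_request["messages"].insert(
        0,
        {"role": "system", "content": f"Known caller context: {caller_context}"},
    )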