This issue provides working example code to generate WAV audio from text using Gemini TTS and present it in a Shiny audio preview modal.
# Set the ellmer timeout option to 15 minutes (900 seconds) for long audio.
options(ellmer_timeout_s = 15 * 60)
# Generate speech from text with the Gemini TTS endpoint and return WAV bytes.
#
# Posts `text` to `<model>:generateContent` requesting the AUDIO modality,
# base64-decodes the audio found in the first candidate, and prepends a
# 44-byte WAV header to the decoded PCM bytes.
#
# Arguments:
#   text    Text to synthesize.
#   voice   Prebuilt voice name sent as `voiceName`.
#   model   Model identifier appended to the API base URL.
#   api_key API key; when NULL the GOOGLE_API_KEY environment variable is used.
#
# Returns:
#   A raw vector: WAV header followed by the PCM payload.
#
# Errors:
#   Stops when no API key is available, when the HTTP request fails (raised by
#   httr2), or when the response carries no inline audio data.
gemini_tts <- function(text, voice = "charon",
                       model = "gemini-2.5-flash-preview-tts",
                       api_key = NULL) {
  # Explicit NULL check rather than `%||%`, which base R only provides from
  # version 4.4 onwards.
  if (is.null(api_key)) {
    api_key <- Sys.getenv("GOOGLE_API_KEY")
  }
  if (length(api_key) != 1L || is.na(api_key) || !nzchar(api_key)) {
    stop(
      "Google API key not found. ",
      "Set GOOGLE_API_KEY or pass api_key as a parameter.",
      call. = FALSE
    )
  }

  # Internal function to create a 44-byte WAV header for raw PCM data.
  create_wav_header <- function(data_length, sample_rate = 24000,
                                bits_per_sample = 16, channels = 1) {
    # The size fields are written as 32-bit integers; as.integer() would
    # silently yield NA for larger values and corrupt the header.
    if (data_length + 36 > .Machine$integer.max) {
      stop("Audio data is too large to describe in a WAV header.",
           call. = FALSE)
    }

    # Encode a number as `size` little-endian bytes.
    le_bytes <- function(value, size) {
      writeBin(as.integer(value), raw(), size = size, endian = "little")
    }

    block_align <- channels * bits_per_sample / 8
    byte_rate <- sample_rate * block_align

    c(
      # "RIFF" chunk
      charToRaw("RIFF"),
      le_bytes(data_length + 36, 4L),
      charToRaw("WAVE"),
      # "fmt " subchunk: 16-byte body, format tag 1
      charToRaw("fmt "),
      le_bytes(16L, 4L),
      le_bytes(1L, 2L),
      le_bytes(channels, 2L),
      le_bytes(sample_rate, 4L),
      le_bytes(byte_rate, 4L),
      le_bytes(block_align, 2L),
      le_bytes(bits_per_sample, 2L),
      # "data" subchunk
      charToRaw("data"),
      le_bytes(data_length, 4L)
    )
  }

  # Build URL
  base_url <- "https://generativelanguage.googleapis.com/v1beta/models"
  url <- paste0(base_url, "/", model, ":generateContent")

  body <- list(
    contents = list(list(
      parts = list(list(text = text))
    )),
    generationConfig = list(
      responseModalities = list("AUDIO"),
      speechConfig = list(
        voiceConfig = list(
          prebuiltVoiceConfig = list(voiceName = voice)
        )
      )
    )
  )

  # Send the key as a header instead of a query parameter so it does not
  # become part of the request URL.
  req <- httr2::request(url) |>
    httr2::req_headers("x-goog-api-key" = api_key) |>
    httr2::req_timeout(60 * 15) |>
    httr2::req_body_json(body)

  resp <- httr2::req_perform(req)
  json_resp <- httr2::resp_body_json(resp)

  # Walk the response defensively: a missing or empty `candidates`/`parts`
  # must reach the explicit error below rather than fail with
  # "subscript out of bounds".
  candidates <- json_resp[["candidates"]]
  parts <- NULL
  if (length(candidates) > 0L && is.list(candidates[[1]])) {
    parts <- candidates[[1]][["content"]][["parts"]]
  }

  # Take the first part that actually carries inline data.
  audio_base64 <- NULL
  for (part in parts) {
    encoded <- if (is.list(part)) part[["inlineData"]][["data"]] else NULL
    if (!is.null(encoded)) {
      audio_base64 <- encoded
      break
    }
  }
  if (is.null(audio_base64)) {
    stop("No audio received from the API response", call. = FALSE)
  }

  # Decode and create WAV
  audio_pcm <- base64enc::base64decode(audio_base64)
  wav_header <- create_wav_header(length(audio_pcm))

  # Return complete raw WAV
  c(wav_header, audio_pcm)
}
# Build a Shiny modal dialog containing an audio player for a base64 WAV.
#
# Arguments:
#   wav_b64 Base64 text of the WAV data; embedded in a `data:audio/wav` URI.
#   title   Text used for both the modal title and the card header.
#
# Returns:
#   The object produced by shiny::modalDialog().
audio_preview_modal <- function(wav_b64, title = "Audio Preview (TTS)") {
  # Inline the audio as a data URI so no file needs to be served.
  wav_source <- shiny::tags$source(
    src = paste0("data:audio/wav;base64,", wav_b64),
    type = "audio/wav"
  )

  player <- shiny::tags$audio(
    controls = NA,
    autoplay = NA,
    style = "width: 100%;",
    wav_source
  )

  # Shown under the player in case playback does not begin by itself.
  hint <- shiny::tags$p(
    class = "text-muted",
    "If the audio doesn't start, press Play."
  )

  preview_card <- bslib::card(
    bslib::card_header(class = "bg-dark text-light", title),
    bslib::card_body(
      style = "background-color: #000000; color: #e0e0e0;",
      player,
      shiny::tags$hr(),
      hint
    )
  )

  shiny::modalDialog(
    title = title,
    size = "m",
    easyClose = TRUE,
    preview_card,
    footer = shiny::modalButton("Close")
  )
}
# MAIN
# Synthesize a short greeting, base64-encode the WAV bytes, and show them in
# the preview modal.
# NOTE(review): shiny::showModal() is normally called from inside a running
# Shiny session - confirm where this snippet is meant to be executed.
wav_data <- gemini_tts(text = "Hi, I'm Google Gemini Text to Speech.")
wav_b64 <- base64enc::base64encode(wav_data)
wav_b64 |>
  audio_preview_modal() |>
  shiny::showModal()
This issue provides working example code to generate WAV audio from text using Gemini TTS and present it in a Shiny audio preview modal.
Example Code
Instructions:
Adjust the `voice` and `model` parameters as needed for your application.