Skip to content

Example code to generate WAV audio from text using Gemini TTS and present it in a Shiny audio preview modal #937

@diegoperoni

Description

@diegoperoni

This issue provides working example code to generate WAV audio from text using Gemini TTS and present it in a Shiny audio preview modal.

Example Code

# Set the `ellmer_timeout_s` option to 900 (seconds, per its name) so long
# audio generations are not cut short.
# NOTE(review): gemini_tts() below applies its own httr2 timeout (60*15), so
# this option presumably only matters for requests made via ellmer — confirm.
options(ellmer_timeout_s = 900) # for long audio

#' Generate speech with Gemini TTS and return it as WAV bytes
#'
#' Posts `text` to the Gemini `generateContent` endpoint asking for the AUDIO
#' modality, base64-decodes the PCM payload found in the first candidate, and
#' prepends a 44-byte RIFF/WAVE header so the result is a playable WAV file.
#'
#' @param text A single non-empty string to synthesize.
#' @param voice Prebuilt voice name, sent as `voiceName`.
#' @param model Model id inserted into the request URL.
#' @param api_key Google API key. When `NULL`, the `GOOGLE_API_KEY`
#'   environment variable is used.
#' @return A raw vector: the WAV header followed by the PCM samples.
gemini_tts <- function(text, voice = "charon", model = "gemini-2.5-flash-preview-tts", api_key = NULL) {
  # Explicit NULL check instead of `%||%`, which only exists in base R >= 4.4.
  if (is.null(api_key)) {
    api_key <- Sys.getenv("GOOGLE_API_KEY")
  }
  if (!is.character(api_key) || length(api_key) != 1L ||
      is.na(api_key) || !nzchar(api_key)) {
    stop(
      "Google API key not found. Set GOOGLE_API_KEY or pass api_key as a parameter.",
      call. = FALSE
    )
  }
  if (!is.character(text) || length(text) != 1L || is.na(text) || !nzchar(text)) {
    stop("`text` must be a single non-empty string.", call. = FALSE)
  }

  # Internal function to create a canonical 44-byte PCM WAV header.
  # All multi-byte fields are little-endian, as the RIFF format requires.
  create_wav_header <- function(data_length, sample_rate = 24000, bits_per_sample = 16, channels = 1) {
    byte_rate <- sample_rate * channels * bits_per_sample / 8
    block_align <- channels * bits_per_sample / 8
    le <- function(value, size) {
      writeBin(as.integer(value), raw(), size = size, endian = "little")
    }
    header <- raw(44)
    # "RIFF" chunk; its size field counts everything after the first 8 bytes.
    header[1:4] <- charToRaw("RIFF")
    header[5:8] <- le(data_length + 36, 4)
    header[9:12] <- charToRaw("WAVE")
    # "fmt " subchunk: 16-byte body, format tag 1 = uncompressed PCM.
    header[13:16] <- charToRaw("fmt ")
    header[17:20] <- le(16L, 4)
    header[21:22] <- le(1L, 2)
    header[23:24] <- le(channels, 2)
    header[25:28] <- le(sample_rate, 4)
    header[29:32] <- le(byte_rate, 4)
    header[33:34] <- le(block_align, 2)
    header[35:36] <- le(bits_per_sample, 2)
    # "data" subchunk
    header[37:40] <- charToRaw("data")
    header[41:44] <- le(data_length, 4)
    header
  }

  # Build URL
  base_url <- "https://generativelanguage.googleapis.com/v1beta/models"
  url <- paste0(base_url, "/", model, ":generateContent")

  # Send the key as a redacted header rather than a `?key=` query parameter so
  # it does not end up in logged URLs or printed request objects.
  req <- httr2::request(url) |>
    httr2::req_headers(`x-goog-api-key` = api_key, .redact = "x-goog-api-key") |>
    httr2::req_timeout(60 * 15) |>
    httr2::req_body_json(list(
      contents = list(list(
        parts = list(list(text = text))
      )),
      generationConfig = list(
        responseModalities = list("AUDIO"),
        speechConfig = list(
          voiceConfig = list(
            prebuiltVoiceConfig = list(
              voiceName = voice
            )
          )
        )
      )
    ))
  resp <- httr2::req_perform(req)
  json_resp <- httr2::resp_body_json(resp)

  # Walk the response defensively: indexing `[[1]]` into a missing or empty
  # `candidates` list would raise "subscript out of bounds" before the
  # friendly error below could be reached.
  candidates <- json_resp[["candidates"]]
  parts <- if (length(candidates) > 0) candidates[[1]]$content$parts else NULL
  inline <- NULL
  for (part in parts) {
    if (is.list(part) && !is.null(part$inlineData$data)) {
      inline <- part$inlineData
      break
    }
  }
  if (is.null(inline)) {
    stop("No audio received from the API response", call. = FALSE)
  }

  # Use the sample rate announced in the payload's mimeType when it carries a
  # `rate=<n>` parameter (presumably e.g. "audio/L16;codec=pcm;rate=24000" —
  # verify against the API docs); otherwise keep the 24 kHz default.
  sample_rate <- 24000L
  mime_type <- inline$mimeType
  if (is.character(mime_type) && length(mime_type) == 1L && !is.na(mime_type)) {
    rate_match <- regmatches(mime_type, regexec("rate=([0-9]+)", mime_type))[[1]]
    if (length(rate_match) == 2L) {
      sample_rate <- as.integer(rate_match[2])
    }
  }

  # Decode and create WAV
  audio_pcm <- base64enc::base64decode(inline$data)
  wav_header <- create_wav_header(length(audio_pcm), sample_rate = sample_rate)
  # Return complete raw WAV
  c(wav_header, audio_pcm)
}

#' Build a Shiny modal that plays a base64-encoded WAV clip
#'
#' @param wav_b64 Base64 string of a WAV file; embedded in a `data:` URI.
#' @param title Text used both as the modal title and as the card header.
#' @return The object returned by `shiny::modalDialog()`.
audio_preview_modal <- function(wav_b64, title = "Audio Preview (TTS)") {
  # Inline the audio as a data URI so no file has to be served.
  data_uri <- paste0("data:audio/wav;base64,", wav_b64)

  player <- shiny::tags$audio(
    controls = NA,
    autoplay = NA,
    style = "width: 100%;",
    shiny::tags$source(src = data_uri, type = "audio/wav")
  )
  fallback_hint <- shiny::tags$p(
    class = "text-muted",
    "If the audio doesn't start, press Play."
  )

  preview_card <- bslib::card(
    bslib::card_header(class = "bg-dark text-light", title),
    bslib::card_body(
      style = "background-color: #000000; color: #e0e0e0;",
      player,
      shiny::tags$hr(),
      fallback_hint
    )
  )

  shiny::modalDialog(
    preview_card,
    title = title,
    size = "m",
    easyClose = TRUE,
    footer = shiny::modalButton("Close")
  )
}

# MAIN
# Synthesize a greeting, base64-encode the WAV bytes, and show the preview.
# NOTE(review): shiny::showModal() presumably needs an active Shiny session,
# so these lines likely belong inside a server function or observer — confirm.
wav_data <- gemini_tts(text = "Hi, I'm Google Gemini Text to Speech.")
wav_b64 <- wav_data |> base64enc::base64encode()
wav_b64 |>
  audio_preview_modal() |>
  shiny::showModal()

Instructions:

  • Make sure your Google API key is available: set the GOOGLE_API_KEY environment variable or pass api_key to gemini_tts().
  • Adjust voice and model parameters as needed for your application.
  • Show the audio preview modal from your Shiny app's server code (shiny::showModal() needs an active session) to allow users to listen to and download the generated audio.

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions