Transcribe an audio file by using multimodal AI models

This sample shows how to use an audio file to generate a podcast transcript with timestamps.

Explore further

For detailed documentation that includes this code sample, see the following:

Code sample

Go

Before trying this sample, follow the Go setup instructions in the Vertex AI quickstart using client libraries. For more information, see the Vertex AI Go API reference documentation.

To authenticate to Vertex AI, set up Application Default Credentials. For more information, see Set up authentication for a local development environment.

import (
	"context"
	"fmt"
	"io"

	genai "google.golang.org/genai"
)

// generateAudioTranscript shows how to generate an audio transcript.
func generateAudioTranscript(w io.Writer) error {
	ctx := context.Background()

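	// NOTE: With no explicit backend, project, or location set on ClientConfig,
	// the client is expected to read them from the environment (for example
	// GOOGLE_GENAI_USE_VERTEXAI, GOOGLE_CLOUD_PROJECT, and GOOGLE_CLOUD_LOCATION)
	// and to authenticate with Application Default Credentials.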
	client, err := genai.NewClient(ctx, &genai.ClientConfig{
		HTTPOptions: genai.HTTPOptions{APIVersion: "v1"},
	})
	if err != nil {
		return fmt.Errorf("failed to create genai client: %w", err)
	}

	modelName := "gemini-2.5-flash"
	contents := []*genai.Content{
		{Parts: []*genai.Part{
			{Text: `Transcribe the interview, in the format of timecode, speaker, caption.
Use speaker A, speaker B, etc. to identify speakers.`},
			{FileData: &genai.FileData{
				FileURI:  "gs://cloud-samples-data/generative-ai/audio/pixel.mp3",
				MIMEType: "audio/mpeg",
			}},
		},
			Role: genai.RoleUser},
	}

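	// The generation config is nil here; the Java, Node.js, and Python samples on
	// this page enable audio timestamps in their generation config for audio-only files.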
	resp, err := client.Models.GenerateContent(ctx, modelName, contents, nil)
	if err != nil {
		return fmt.Errorf("failed to generate content: %w", err)
	}

	respText := resp.Text()

	fmt.Fprintln(w, respText)

	// Example response:
	// 00:00:00, A: your devices are getting better over time.
	// 00:01:13, A: And so we think about it across the entire portfolio from phones to watch, ...
	// ...

	return nil
}

Java

Before trying this sample, follow the Java setup instructions in the Vertex AI quickstart using client libraries. For more information, see the Vertex AI Java API reference documentation.

To authenticate to Vertex AI, set up Application Default Credentials. For more information, see Set up authentication for a local development environment.


import com.google.genai.Client;
import com.google.genai.types.Content;
import com.google.genai.types.GenerateContentConfig;
import com.google.genai.types.GenerateContentResponse;
import com.google.genai.types.HttpOptions;
import com.google.genai.types.Part;

public class TextGenerationTranscriptWithGcsAudio {

  public static void main(String[] args) {
    // TODO(developer): Replace these variables before running the sample.
    String modelId = "gemini-2.5-flash";
    generateContent(modelId);
  }

  // Generates transcript with audio input
  public static String generateContent(String modelId) {
    // Client Initialization. Once created, it can be reused for multiple requests.
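    // With no explicit project set, the builder is expected to read the project ID from the
    // GOOGLE_CLOUD_PROJECT environment variable and to authenticate with Application Default
    // Credentials.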
    try (Client client =
        Client.builder()
            .location("global")
            .vertexAI(true)
            .httpOptions(HttpOptions.builder().apiVersion("v1").build())
            .build()) {

      String prompt =
          "Transcribe the interview, in the format of timecode, speaker, caption.\n"
              + "Use speaker A, speaker B, etc. to identify speakers.";

      // Enable audioTimestamp to generate timestamps for audio-only files.
      GenerateContentConfig contentConfig =
          GenerateContentConfig.builder().audioTimestamp(true).build();

      GenerateContentResponse response =
          client.models.generateContent(
              modelId,
              Content.fromParts(
                  Part.fromUri(
                      "gs://cloud-samples-data/generative-ai/audio/pixel.mp3", "audio/mpeg"),
                  Part.fromText(prompt)),
              contentConfig);

      System.out.print(response.text());
      // Example response:
      // 00:00 - Speaker A: your devices are getting better over time. And so we think about it...
      // 00:14 - Speaker B: Welcome to the Made by Google Podcast, where we meet the people who...
      // 00:41 - Speaker A: So many features. I am a singer, so I actually think recorder...
      return response.text();
    }
  }
}

Node.js

Before trying this sample, follow the Node.js setup instructions in the Vertex AI quickstart using client libraries. For more information, see the Vertex AI Node.js API reference documentation.

To authenticate to Vertex AI, set up Application Default Credentials. For more information, see Set up authentication for a local development environment.

const {GoogleGenAI} = require('@google/genai');

const GOOGLE_CLOUD_PROJECT = process.env.GOOGLE_CLOUD_PROJECT;
const GOOGLE_CLOUD_LOCATION = process.env.GOOGLE_CLOUD_LOCATION || 'global';

async function generateText(
  projectId = GOOGLE_CLOUD_PROJECT,
  location = GOOGLE_CLOUD_LOCATION
) {
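  // Targeting Vertex AI, the client authenticates with Application Default Credentials.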
  const client = new GoogleGenAI({
    vertexai: true,
    project: projectId,
    location: location,
  });

  const prompt = `Transcribe the interview, in the format of timecode, speaker, caption.
    Use speaker A, speaker B, etc. to identify speakers.`;

  const response = await client.models.generateContent({
    model: 'gemini-2.5-flash',
    contents: [
      {text: prompt},
      {
        fileData: {
          fileUri: 'gs://cloud-samples-data/generative-ai/audio/pixel.mp3',
          mimeType: 'audio/mpeg',
        },
      },
    ],
    // Required to enable timestamp understanding for audio-only files
    config: {
      audioTimestamp: true,
    },
  });

  console.log(response.text);

  // Example response:
  // [00:00:00] **Speaker A:** your devices are getting better over time. And so ...
  // [00:00:14] **Speaker B:** Welcome to the Made by Google podcast where we meet ...
  // [00:00:20] **Speaker B:** Here's your host, Rasheed Finch.
  // [00:00:23] **Speaker C:** Today we're talking to Aisha Sharif and DeCarlos Love. ...
  // ...

  return response.text;
}

Python

Before trying this sample, follow the Python setup instructions in the Vertex AI quickstart using client libraries. For more information, see the Vertex AI Python API reference documentation.

To authenticate to Vertex AI, set up Application Default Credentials. For more information, see Set up authentication for a local development environment.

from google import genai
from google.genai.types import GenerateContentConfig, HttpOptions, Part

client = genai.Client(http_options=HttpOptions(api_version="v1"))
prompt = """
Transcribe the interview, in the format of timecode, speaker, caption.
Use speaker A, speaker B, etc. to identify speakers.
"""
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=[
        prompt,
        Part.from_uri(
            file_uri="gs://cloud-samples-data/generative-ai/audio/pixel.mp3",
            mime_type="audio/mpeg",
        ),
    ],
    # Required to enable timestamp understanding for audio-only files
    config=GenerateContentConfig(audio_timestamp=True),
)
print(response.text)
# Example response:
# [00:00:00] **Speaker A:** your devices are getting better over time. And so ...
# [00:00:14] **Speaker B:** Welcome to the Made by Google podcast where we meet ...
# [00:00:20] **Speaker B:** Here's your host, Rasheed Finch.
# [00:00:23] **Speaker C:** Today we're talking to Aisha Sharif and DeCarlos Love. ...
# ...
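
The sample above expects the Vertex AI backend, project, and location to be supplied through environment variables (for example, GOOGLE_GENAI_USE_VERTEXAI, GOOGLE_CLOUD_PROJECT, and GOOGLE_CLOUD_LOCATION). As a minimal sketch, the client can instead be configured explicitly; the project ID and location below are placeholders, not values taken from the sample:

from google import genai
from google.genai.types import HttpOptions

# Placeholder project ID and location; replace them with your own values.
client = genai.Client(
    vertexai=True,
    project="your-project-id",
    location="global",
    http_options=HttpOptions(api_version="v1"),
)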

What's next

To search and filter code samples for other Google Cloud products, see the Google Cloud sample browser.