Skip to content

Commit

Permalink
feat: Add support for audio in chat completions in openai_dart (#577)
Browse files Browse the repository at this point in the history
  • Loading branch information
davidmigloz authored Oct 21, 2024
1 parent 45b9f42 commit 0fb058c
Show file tree
Hide file tree
Showing 38 changed files with 21,565 additions and 14,753 deletions.
76 changes: 75 additions & 1 deletion packages/openai_dart/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ Unofficial Dart client for [OpenAI](https://platform.openai.com/docs/api-referen

**Supported endpoints:**

- Chat (with structured outputs, tools and streaming support)
- Chat (with text, image, audio, structured outputs, tools and streaming support)
- Completions (legacy)
- Embeddings
- Fine-tuning
Expand Down Expand Up @@ -223,6 +223,80 @@ ChatCompletionMessage.user(
//...
```

In addition to generating text and images, some models enable you to generate a spoken audio response to a prompt:

```dart
final res = await client.createChatCompletion(
request: CreateChatCompletionRequest(
model: ChatCompletionModel.model(
ChatCompletionModels.gpt4oAudioPreview,
),
modalities: [
ChatCompletionModality.text,
ChatCompletionModality.audio,
],
audio: ChatCompletionAudioOptions(
voice: ChatCompletionAudioVoice.alloy,
format: ChatCompletionAudioFormat.wav,
),
messages: [
ChatCompletionMessage.user(
content: ChatCompletionUserMessageContent.string(
'Is a golden retriever a good family dog?',
),
),
],
),
);
final choice = res.choices.first;
final audio = choice.message.audio;
print(audio?.id);
print(audio?.expiresAt);
print(audio?.transcript);
print(audio?.data);
```

And to use audio inputs to prompt the model:

```dart
final res = await client.createChatCompletion(
request: CreateChatCompletionRequest(
model: ChatCompletionModel.model(
ChatCompletionModels.gpt4oAudioPreview,
),
modalities: [
ChatCompletionModality.text,
ChatCompletionModality.audio,
],
audio: ChatCompletionAudioOptions(
voice: ChatCompletionAudioVoice.alloy,
format: ChatCompletionAudioFormat.wav,
),
messages: [
ChatCompletionMessage.user(
content: ChatCompletionUserMessageContent.parts([
ChatCompletionMessageContentPart.text(
text: 'Do what the recording says',
),
ChatCompletionMessageContentPart.audio(
inputAudio: ChatCompletionMessageInputAudio(
data: 'UklGRoYZAQBXQVZFZm10I...//X//v8FAOj/GAD+/7z/',
format: ChatCompletionMessageInputAudioFormat.wav,
),
),
]),
),
],
  ),
);
final choice = res.choices.first;
final audio = choice.message.audio;
print(audio?.id);
print(audio?.expiresAt);
print(audio?.transcript);
print(audio?.data);
```

**Structured output: ([docs](https://platform.openai.com/docs/guides/structured-outputs))**

Structured Outputs is a feature that ensures the model will always generate responses that adhere to your supplied JSON Schema.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,13 +40,15 @@ class AssistantObject with _$AssistantObject {
/// types `code_interpreter`, `file_search`, or `function`.
required List<AssistantTools> tools,

/// A set of resources that are made available to the assistant's tools in this thread. The resources are specific to the type of tool. For example, the `code_interpreter` tool requires a list of file IDs, while the `file_search` tool requires a list of vector store IDs.
/// A set of resources that are made available to the assistant's tools in this thread. The resources are specific
/// to the type of tool. For example, the `code_interpreter` tool requires a list of file IDs, while the
/// `file_search` tool requires a list of vector store IDs.
@JsonKey(name: 'tool_resources', includeIfNull: false)
ToolResources? toolResources,

/// Set of 16 key-value pairs that can be attached to an object. This can be useful for storing additional
/// information about the object in a structured format. Keys can be a maximum of 64 characters long and values
/// can be a maxium of 512 characters long.
/// can be a maximum of 512 characters long.
required Map<String, dynamic>? metadata,

/// What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random,
Expand Down
2 changes: 1 addition & 1 deletion packages/openai_dart/lib/src/generated/schema/batch.dart
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ class Batch with _$Batch {

/// Set of 16 key-value pairs that can be attached to an object. This can be useful for storing additional
/// information about the object in a structured format. Keys can be a maximum of 64 characters long and values
/// can be a maxium of 512 characters long.
/// can be a maximum of 512 characters long.
@JsonKey(includeIfNull: false) dynamic metadata,
}) = _Batch;

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
// coverage:ignore-file
// GENERATED CODE - DO NOT MODIFY BY HAND
// ignore_for_file: type=lint
// ignore_for_file: invalid_annotation_target
part of open_a_i_schema;

// ==========================================
// ENUM: ChatCompletionAudioFormat
// ==========================================

/// Specifies the output audio format. Must be one of `wav`, `mp3`, `flac`, `opus`, or `pcm16`.
enum ChatCompletionAudioFormat {
  /// Serialized as `'wav'`.
  @JsonValue('wav')
  wav,

  /// Serialized as `'mp3'`.
  @JsonValue('mp3')
  mp3,

  /// Serialized as `'flac'`.
  @JsonValue('flac')
  flac,

  /// Serialized as `'opus'`.
  @JsonValue('opus')
  opus,

  /// Serialized as `'pcm16'`.
  @JsonValue('pcm16')
  pcm16,
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
// coverage:ignore-file
// GENERATED CODE - DO NOT MODIFY BY HAND
// ignore_for_file: type=lint
// ignore_for_file: invalid_annotation_target
part of open_a_i_schema;

// ==========================================
// CLASS: ChatCompletionAudioOptions
// ==========================================

/// Parameters for audio output. Required when audio output is requested with `modalities: ["audio"]`.
/// [Learn more](https://platform.openai.com/docs/guides/audio).
@freezed
class ChatCompletionAudioOptions with _$ChatCompletionAudioOptions {
  // Private constructor required by freezed so the class can declare
  // non-generated members (validateSchema, toMap) alongside the union.
  const ChatCompletionAudioOptions._();

  /// Factory constructor for ChatCompletionAudioOptions
  const factory ChatCompletionAudioOptions({
    /// Specifies the voice type. Supported voices are `alloy`, `echo`, `fable`, `onyx`, `nova`, and `shimmer`.
    required ChatCompletionAudioVoice voice,

    /// Specifies the output audio format. Must be one of `wav`, `mp3`, `flac`, `opus`, or `pcm16`.
    required ChatCompletionAudioFormat format,
  }) = _ChatCompletionAudioOptions;

  /// Object construction from a JSON representation
  factory ChatCompletionAudioOptions.fromJson(Map<String, dynamic> json) =>
      _$ChatCompletionAudioOptionsFromJson(json);

  /// List of all property names of schema
  static const List<String> propertyNames = ['voice', 'format'];

  /// Perform validations on the schema property values.
  ///
  /// Returns `null`, meaning valid; this schema defines no value constraints
  /// beyond the enum types enforced at compile time.
  String? validateSchema() {
    return null;
  }

  /// Map representation of object (not serialized)
  ///
  /// Note: values are the enum instances themselves, not their JSON wire
  /// strings; use [toJson] (generated) for serialization.
  Map<String, dynamic> toMap() {
    return {
      'voice': voice,
      'format': format,
    };
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
// coverage:ignore-file
// GENERATED CODE - DO NOT MODIFY BY HAND
// ignore_for_file: type=lint
// ignore_for_file: invalid_annotation_target
part of open_a_i_schema;

// ==========================================
// ENUM: ChatCompletionAudioVoice
// ==========================================

/// Specifies the voice type. Supported voices are `alloy`, `echo`, `fable`, `onyx`, `nova`, and `shimmer`.
enum ChatCompletionAudioVoice {
  /// Serialized as `'alloy'`.
  @JsonValue('alloy')
  alloy,

  /// Serialized as `'echo'`.
  @JsonValue('echo')
  echo,

  /// Serialized as `'fable'`.
  @JsonValue('fable')
  fable,

  /// Serialized as `'onyx'`.
  @JsonValue('onyx')
  onyx,

  /// Serialized as `'nova'`.
  @JsonValue('nova')
  nova,

  /// Serialized as `'shimmer'`.
  @JsonValue('shimmer')
  shimmer,
}
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,10 @@ sealed class ChatCompletionMessage with _$ChatCompletionMessage {
/// Deprecated and replaced by `tool_calls`. The name and arguments of a function that should be called, as generated by the model.
@JsonKey(name: 'function_call', includeIfNull: false)
ChatCompletionMessageFunctionCall? functionCall,

/// If the audio output modality is requested, this object contains data about the audio response from the model.
/// [Learn more](https://platform.openai.com/docs/guides/audio).
@JsonKey(includeIfNull: false) ChatCompletionAssistantMessageAudio? audio,
}) = ChatCompletionAssistantMessage;

// ------------------------------------------
Expand Down Expand Up @@ -138,7 +142,9 @@ sealed class ChatCompletionUserMessageContent
with _$ChatCompletionUserMessageContent {
const ChatCompletionUserMessageContent._();

/// An array of content parts with a defined type, each can be of type `text` or `image_url` when passing in images. You can pass multiple images by adding multiple `image_url` content parts. Image input is only supported when using the `gpt-4o` model.
/// An array of content parts with a defined type. Supported options differ based on the
/// [model](https://platform.openai.com/docs/models)
/// being used to generate the response. Can contain text, image, or audio inputs.
const factory ChatCompletionUserMessageContent.parts(
List<ChatCompletionMessageContentPart> value,
) = ChatCompletionMessageContentParts;
Expand Down Expand Up @@ -183,3 +189,59 @@ class _ChatCompletionUserMessageContentConverter
};
}
}

// ==========================================
// CLASS: ChatCompletionAssistantMessageAudio
// ==========================================

/// If the audio output modality is requested, this object contains data about the audio response from the model.
/// [Learn more](https://platform.openai.com/docs/guides/audio).
@freezed
class ChatCompletionAssistantMessageAudio
    with _$ChatCompletionAssistantMessageAudio {
  // Private constructor required by freezed so the class can declare
  // non-generated members (validateSchema, toMap) alongside the union.
  const ChatCompletionAssistantMessageAudio._();

  /// Factory constructor for ChatCompletionAssistantMessageAudio
  const factory ChatCompletionAssistantMessageAudio({
    /// Unique identifier for this audio response.
    required String id,

    /// The Unix timestamp (in seconds) for when this audio response will no longer be accessible on the server
    /// for use in multi-turn conversations.
    @JsonKey(name: 'expires_at') required int expiresAt,

    /// Base64 encoded audio bytes generated by the model, in the format specified in the request.
    required String data,

    /// Transcript of the audio generated by the model.
    required String transcript,
  }) = _ChatCompletionAssistantMessageAudio;

  /// Object construction from a JSON representation
  factory ChatCompletionAssistantMessageAudio.fromJson(
          Map<String, dynamic> json) =>
      _$ChatCompletionAssistantMessageAudioFromJson(json);

  /// List of all property names of schema
  ///
  /// Names are the JSON wire names (e.g. `expires_at`, not `expiresAt`).
  static const List<String> propertyNames = [
    'id',
    'expires_at',
    'data',
    'transcript'
  ];

  /// Perform validations on the schema property values.
  ///
  /// Returns `null`, meaning valid; this schema defines no value constraints.
  String? validateSchema() {
    return null;
  }

  /// Map representation of object (not serialized)
  Map<String, dynamic> toMap() {
    return {
      'id': id,
      'expires_at': expiresAt,
      'data': data,
      'transcript': transcript,
    };
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,22 @@ sealed class ChatCompletionMessageContentPart
@JsonKey(name: 'image_url') required ChatCompletionMessageImageUrl imageUrl,
}) = ChatCompletionMessageContentPartImage;

// ------------------------------------------
// UNION: ChatCompletionMessageContentPartAudio
// ------------------------------------------

/// An audio content part of a user message.
/// Learn about [audio inputs](https://platform.openai.com/docs/guides/audio).
const factory ChatCompletionMessageContentPart.audio({
/// The type of the content part. Always `input_audio`.
@Default(ChatCompletionMessageContentPartType.inputAudio)
ChatCompletionMessageContentPartType type,

/// The audio input.
@JsonKey(name: 'input_audio')
required ChatCompletionMessageInputAudio inputAudio,
}) = ChatCompletionMessageContentPartAudio;

// ------------------------------------------
// UNION: ChatCompletionMessageContentPartRefusal
// ------------------------------------------
Expand Down Expand Up @@ -71,6 +87,8 @@ enum ChatCompletionMessageContentPartEnumType {
text,
@JsonValue('image_url')
imageUrl,
@JsonValue('input_audio')
inputAudio,
@JsonValue('refusal')
refusal,
}
Expand Down Expand Up @@ -128,3 +146,54 @@ enum ChatCompletionMessageImageDetail {
@JsonValue('high')
high,
}

// ==========================================
// CLASS: ChatCompletionMessageInputAudio
// ==========================================

/// The audio input.
@freezed
class ChatCompletionMessageInputAudio with _$ChatCompletionMessageInputAudio {
  // Private constructor required by freezed so the class can declare
  // non-generated members (validateSchema, toMap) alongside the union.
  const ChatCompletionMessageInputAudio._();

  /// Factory constructor for ChatCompletionMessageInputAudio
  const factory ChatCompletionMessageInputAudio({
    /// Base64 encoded audio data.
    required String data,

    /// The format of the encoded audio data. Currently supports "wav" and "mp3".
    required ChatCompletionMessageInputAudioFormat format,
  }) = _ChatCompletionMessageInputAudio;

  /// Object construction from a JSON representation
  factory ChatCompletionMessageInputAudio.fromJson(Map<String, dynamic> json) =>
      _$ChatCompletionMessageInputAudioFromJson(json);

  /// List of all property names of schema
  static const List<String> propertyNames = ['data', 'format'];

  /// Perform validations on the schema property values.
  ///
  /// Returns `null`, meaning valid; this schema defines no value constraints.
  /// Note: `data` is not checked for being well-formed base64 here.
  String? validateSchema() {
    return null;
  }

  /// Map representation of object (not serialized)
  ///
  /// Note: `format` is the enum instance itself, not its JSON wire string.
  Map<String, dynamic> toMap() {
    return {
      'data': data,
      'format': format,
    };
  }
}

// ==========================================
// ENUM: ChatCompletionMessageInputAudioFormat
// ==========================================

/// The format of the encoded audio data. Currently supports "wav" and "mp3".
enum ChatCompletionMessageInputAudioFormat {
  /// Serialized as `'wav'`.
  @JsonValue('wav')
  wav,

  /// Serialized as `'mp3'`.
  @JsonValue('mp3')
  mp3,
}
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ enum ChatCompletionMessageContentPartType {
text,
@JsonValue('image_url')
imageUrl,
@JsonValue('input_audio')
inputAudio,
@JsonValue('refusal')
refusal,
}
Loading

0 comments on commit 0fb058c

Please sign in to comment.