Skip to content

Commit

Permalink
feat: Add support for audio in chat completions in openai_dart (#577)
Browse files Browse the repository at this point in the history
  • Loading branch information
davidmigloz authored Oct 21, 2024
1 parent 45b9f42 commit 0fb058c
Show file tree
Hide file tree
Showing 38 changed files with 21,565 additions and 14,753 deletions.
76 changes: 75 additions & 1 deletion packages/openai_dart/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ Unofficial Dart client for [OpenAI](https://platform.openai.com/docs/api-referen

**Supported endpoints:**

- Chat (with structured outputs, tools and streaming support)
- Chat (with text, image, audio, structured outputs, tools and streaming support)
- Completions (legacy)
- Embeddings
- Fine-tuning
Expand Down Expand Up @@ -223,6 +223,80 @@ ChatCompletionMessage.user(
//...
```

In addition to generating text and images, some models enable you to generate a spoken audio response to a prompt:

```dart
final res = await client.createChatCompletion(
request: CreateChatCompletionRequest(
model: ChatCompletionModel.model(
ChatCompletionModels.gpt4oAudioPreview,
),
modalities: [
ChatCompletionModality.text,
ChatCompletionModality.audio,
],
audio: ChatCompletionAudioOptions(
voice: ChatCompletionAudioVoice.alloy,
format: ChatCompletionAudioFormat.wav,
),
messages: [
ChatCompletionMessage.user(
content: ChatCompletionUserMessageContent.string(
'Is a golden retriever a good family dog?',
),
),
],
),
);
final choice = res.choices.first;
final audio = choice.message.audio;
print(audio?.id);
print(audio?.expiresAt);
print(audio?.transcript);
print(audio?.data);
```

And to use audio inputs to prompt the model:

```dart
final res = await client.createChatCompletion(
request: CreateChatCompletionRequest(
model: ChatCompletionModel.model(
ChatCompletionModels.gpt4oAudioPreview,
),
modalities: [
ChatCompletionModality.text,
ChatCompletionModality.audio,
],
audio: ChatCompletionAudioOptions(
voice: ChatCompletionAudioVoice.alloy,
format: ChatCompletionAudioFormat.wav,
),
messages: [
ChatCompletionMessage.user(
content: ChatCompletionUserMessageContent.parts([
ChatCompletionMessageContentPart.text(
text: 'Do what the recording says',
),
ChatCompletionMessageContentPart.audio(
inputAudio: ChatCompletionMessageInputAudio(
data: 'UklGRoYZAQBXQVZFZm10I...//X//v8FAOj/GAD+/7z/',
format: ChatCompletionMessageInputAudioFormat.wav,
),
),
]),
),
],
  ),
);
final choice = res.choices.first;
final audio = choice.message.audio;
print(audio?.id);
print(audio?.expiresAt);
print(audio?.transcript);
print(audio?.data);
```

**Structured output: ([docs](https://platform.openai.com/docs/guides/structured-outputs))**

Structured Outputs is a feature that ensures the model will always generate responses that adhere to your supplied JSON Schema.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,13 +40,15 @@ class AssistantObject with _$AssistantObject {
/// types `code_interpreter`, `file_search`, or `function`.
required List<AssistantTools> tools,

/// A set of resources that are made available to the assistant's tools in this thread. The resources are specific to the type of tool. For example, the `code_interpreter` tool requires a list of file IDs, while the `file_search` tool requires a list of vector store IDs.
/// A set of resources that are made available to the assistant's tools in this thread. The resources are specific
/// to the type of tool. For example, the `code_interpreter` tool requires a list of file IDs, while the
/// `file_search` tool requires a list of vector store IDs.
@JsonKey(name: 'tool_resources', includeIfNull: false)
ToolResources? toolResources,

/// Set of 16 key-value pairs that can be attached to an object. This can be useful for storing additional
/// information about the object in a structured format. Keys can be a maximum of 64 characters long and values
/// can be a maxium of 512 characters long.
/// can be a maximum of 512 characters long.
required Map<String, dynamic>? metadata,

/// What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random,
Expand Down
2 changes: 1 addition & 1 deletion packages/openai_dart/lib/src/generated/schema/batch.dart
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ class Batch with _$Batch {

/// Set of 16 key-value pairs that can be attached to an object. This can be useful for storing additional
/// information about the object in a structured format. Keys can be a maximum of 64 characters long and values
/// can be a maxium of 512 characters long.
/// can be a maximum of 512 characters long.
@JsonKey(includeIfNull: false) dynamic metadata,
}) = _Batch;

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
// coverage:ignore-file
// GENERATED CODE - DO NOT MODIFY BY HAND
// ignore_for_file: type=lint
// ignore_for_file: invalid_annotation_target
part of open_a_i_schema;

// ==========================================
// ENUM: ChatCompletionAudioFormat
// ==========================================

/// Specifies the output audio format. Must be one of `wav`, `mp3`, `flac`, `opus`, or `pcm16`.
enum ChatCompletionAudioFormat {
  /// Serialized as `'wav'`.
  @JsonValue('wav')
  wav,

  /// Serialized as `'mp3'`.
  @JsonValue('mp3')
  mp3,

  /// Serialized as `'flac'`.
  @JsonValue('flac')
  flac,

  /// Serialized as `'opus'`.
  @JsonValue('opus')
  opus,

  /// Serialized as `'pcm16'`.
  @JsonValue('pcm16')
  pcm16,
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
// coverage:ignore-file
// GENERATED CODE - DO NOT MODIFY BY HAND
// ignore_for_file: type=lint
// ignore_for_file: invalid_annotation_target
part of open_a_i_schema;

// ==========================================
// CLASS: ChatCompletionAudioOptions
// ==========================================

/// Parameters for audio output. Required when audio output is requested with `modalities: ["audio"]`.
/// [Learn more](https://platform.openai.com/docs/guides/audio).
@freezed
class ChatCompletionAudioOptions with _$ChatCompletionAudioOptions {
  // Private constructor required by freezed so the class can declare
  // non-generated members (validateSchema, toMap) alongside the union.
  const ChatCompletionAudioOptions._();

  /// Factory constructor for ChatCompletionAudioOptions
  const factory ChatCompletionAudioOptions({
    /// Specifies the voice type. Supported voices are `alloy`, `echo`, `fable`, `onyx`, `nova`, and `shimmer`.
    required ChatCompletionAudioVoice voice,

    /// Specifies the output audio format. Must be one of `wav`, `mp3`, `flac`, `opus`, or `pcm16`.
    required ChatCompletionAudioFormat format,
  }) = _ChatCompletionAudioOptions;

  /// Object construction from a JSON representation
  factory ChatCompletionAudioOptions.fromJson(Map<String, dynamic> json) =>
      _$ChatCompletionAudioOptionsFromJson(json);

  /// List of all property names of schema
  static const List<String> propertyNames = ['voice', 'format'];

  /// Perform validations on the schema property values.
  ///
  /// Returns `null`, meaning valid; this schema defines no value constraints
  /// beyond the enum types enforced at compile time.
  String? validateSchema() {
    return null;
  }

  /// Map representation of object (not serialized)
  ///
  /// Note: values are the enum instances themselves, not their JSON wire
  /// strings; use [toJson] (generated) for serialization.
  Map<String, dynamic> toMap() {
    return {
      'voice': voice,
      'format': format,
    };
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
// coverage:ignore-file
// GENERATED CODE - DO NOT MODIFY BY HAND
// ignore_for_file: type=lint
// ignore_for_file: invalid_annotation_target
part of open_a_i_schema;

// ==========================================
// ENUM: ChatCompletionAudioVoice
// ==========================================

/// Specifies the voice type. Supported voices are `alloy`, `echo`, `fable`, `onyx`, `nova`, and `shimmer`.
enum ChatCompletionAudioVoice {
  /// Serialized as `'alloy'`.
  @JsonValue('alloy')
  alloy,

  /// Serialized as `'echo'`.
  @JsonValue('echo')
  echo,

  /// Serialized as `'fable'`.
  @JsonValue('fable')
  fable,

  /// Serialized as `'onyx'`.
  @JsonValue('onyx')
  onyx,

  /// Serialized as `'nova'`.
  @JsonValue('nova')
  nova,

  /// Serialized as `'shimmer'`.
  @JsonValue('shimmer')
  shimmer,
}
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,10 @@ sealed class ChatCompletionMessage with _$ChatCompletionMessage {
/// Deprecated and replaced by `tool_calls`. The name and arguments of a function that should be called, as generated by the model.
@JsonKey(name: 'function_call', includeIfNull: false)
ChatCompletionMessageFunctionCall? functionCall,

/// If the audio output modality is requested, this object contains data about the audio response from the model.
/// [Learn more](https://platform.openai.com/docs/guides/audio).
@JsonKey(includeIfNull: false) ChatCompletionAssistantMessageAudio? audio,
}) = ChatCompletionAssistantMessage;

// ------------------------------------------
Expand Down Expand Up @@ -138,7 +142,9 @@ sealed class ChatCompletionUserMessageContent
with _$ChatCompletionUserMessageContent {
const ChatCompletionUserMessageContent._();

/// An array of content parts with a defined type, each can be of type `text` or `image_url` when passing in images. You can pass multiple images by adding multiple `image_url` content parts. Image input is only supported when using the `gpt-4o` model.
/// An array of content parts with a defined type. Supported options differ based on the
/// [model](https://platform.openai.com/docs/models)
/// being used to generate the response. Can contain text, image, or audio inputs.
const factory ChatCompletionUserMessageContent.parts(
List<ChatCompletionMessageContentPart> value,
) = ChatCompletionMessageContentParts;
Expand Down Expand Up @@ -183,3 +189,59 @@ class _ChatCompletionUserMessageContentConverter
};
}
}

// ==========================================
// CLASS: ChatCompletionAssistantMessageAudio
// ==========================================

/// If the audio output modality is requested, this object contains data about the audio response from the model.
/// [Learn more](https://platform.openai.com/docs/guides/audio).
@freezed
class ChatCompletionAssistantMessageAudio
    with _$ChatCompletionAssistantMessageAudio {
  // Private constructor required by freezed so the class can declare
  // non-generated members (validateSchema, toMap) alongside the union.
  const ChatCompletionAssistantMessageAudio._();

  /// Factory constructor for ChatCompletionAssistantMessageAudio
  const factory ChatCompletionAssistantMessageAudio({
    /// Unique identifier for this audio response.
    required String id,

    /// The Unix timestamp (in seconds) for when this audio response will no longer be accessible on the server
    /// for use in multi-turn conversations.
    @JsonKey(name: 'expires_at') required int expiresAt,

    /// Base64 encoded audio bytes generated by the model, in the format specified in the request.
    required String data,

    /// Transcript of the audio generated by the model.
    required String transcript,
  }) = _ChatCompletionAssistantMessageAudio;

  /// Object construction from a JSON representation
  factory ChatCompletionAssistantMessageAudio.fromJson(
          Map<String, dynamic> json) =>
      _$ChatCompletionAssistantMessageAudioFromJson(json);

  /// List of all property names of schema
  ///
  /// Names are the JSON wire names (e.g. `expires_at`, not `expiresAt`).
  static const List<String> propertyNames = [
    'id',
    'expires_at',
    'data',
    'transcript'
  ];

  /// Perform validations on the schema property values.
  ///
  /// Returns `null`, meaning valid; this schema defines no value constraints.
  String? validateSchema() {
    return null;
  }

  /// Map representation of object (not serialized)
  Map<String, dynamic> toMap() {
    return {
      'id': id,
      'expires_at': expiresAt,
      'data': data,
      'transcript': transcript,
    };
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,22 @@ sealed class ChatCompletionMessageContentPart
@JsonKey(name: 'image_url') required ChatCompletionMessageImageUrl imageUrl,
}) = ChatCompletionMessageContentPartImage;

// ------------------------------------------
// UNION: ChatCompletionMessageContentPartAudio
// ------------------------------------------

/// An audio content part of a user message.
/// Learn about [audio inputs](https://platform.openai.com/docs/guides/audio).
const factory ChatCompletionMessageContentPart.audio({
/// The type of the content part. Always `input_audio`.
@Default(ChatCompletionMessageContentPartType.inputAudio)
ChatCompletionMessageContentPartType type,

/// The audio input.
@JsonKey(name: 'input_audio')
required ChatCompletionMessageInputAudio inputAudio,
}) = ChatCompletionMessageContentPartAudio;

// ------------------------------------------
// UNION: ChatCompletionMessageContentPartRefusal
// ------------------------------------------
Expand Down Expand Up @@ -71,6 +87,8 @@ enum ChatCompletionMessageContentPartEnumType {
text,
@JsonValue('image_url')
imageUrl,
@JsonValue('input_audio')
inputAudio,
@JsonValue('refusal')
refusal,
}
Expand Down Expand Up @@ -128,3 +146,54 @@ enum ChatCompletionMessageImageDetail {
@JsonValue('high')
high,
}

// ==========================================
// CLASS: ChatCompletionMessageInputAudio
// ==========================================

/// The audio input.
@freezed
class ChatCompletionMessageInputAudio with _$ChatCompletionMessageInputAudio {
  // Private constructor required by freezed so the class can declare
  // non-generated members (validateSchema, toMap) alongside the union.
  const ChatCompletionMessageInputAudio._();

  /// Factory constructor for ChatCompletionMessageInputAudio
  const factory ChatCompletionMessageInputAudio({
    /// Base64 encoded audio data.
    required String data,

    /// The format of the encoded audio data. Currently supports "wav" and "mp3".
    required ChatCompletionMessageInputAudioFormat format,
  }) = _ChatCompletionMessageInputAudio;

  /// Object construction from a JSON representation
  factory ChatCompletionMessageInputAudio.fromJson(Map<String, dynamic> json) =>
      _$ChatCompletionMessageInputAudioFromJson(json);

  /// List of all property names of schema
  static const List<String> propertyNames = ['data', 'format'];

  /// Perform validations on the schema property values.
  ///
  /// Returns `null`, meaning valid; this schema defines no value constraints.
  /// Note: `data` is not checked for being well-formed base64 here.
  String? validateSchema() {
    return null;
  }

  /// Map representation of object (not serialized)
  ///
  /// Note: `format` is the enum instance itself, not its JSON wire string.
  Map<String, dynamic> toMap() {
    return {
      'data': data,
      'format': format,
    };
  }
}

// ==========================================
// ENUM: ChatCompletionMessageInputAudioFormat
// ==========================================

/// The format of the encoded audio data. Currently supports "wav" and "mp3".
enum ChatCompletionMessageInputAudioFormat {
  /// Serialized as `'wav'`.
  @JsonValue('wav')
  wav,

  /// Serialized as `'mp3'`.
  @JsonValue('mp3')
  mp3,
}
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ enum ChatCompletionMessageContentPartType {
text,
@JsonValue('image_url')
imageUrl,
@JsonValue('input_audio')
inputAudio,
@JsonValue('refusal')
refusal,
}
Loading

0 comments on commit 0fb058c

Please sign in to comment.