fixie-ai · zqhuang211 · Aug 27, 2024 · Aug 21, 2024 · Aug 21, 2024 · Aug 21, 2024
diff --git a/ultravox/data/datasets.py b/ultravox/data/datasets.py
@@ -41,11 +41,11 @@
     # from https://arxiv.org/pdf/2402.08846
     "Transcribe speech to text: <|audio|>",
     # from GPT-4
-    "Capture every word from <|audio|> verbatim",
-    "Convert speech to text from <|audio|>",
-    "Listen and transcribe the complete text from <|audio|>",
-    "Record in writing what is spoken in <|audio|>",
-    "Transcribe the spoken words from <|audio|> with exact wording and punctuation",
+    "Capture every word from the audio verbatim\n<|audio|>",
+    "Convert speech to text from audio\n<|audio|>",
+    "Listen and transcribe the complete text from audio\n<|audio|>",
+    "Record in writing what is spoken in audio\n<|audio|>",
+    "Transcribe the spoken words from audio with exact wording and punctuation\n<|audio|>",
 ]
 ANSWER_PROMPTS = [
     # from Gazelle

diff --git a/ultravox/training/configs/release_config.yaml b/ultravox/training/configs/release_config.yaml
@@ -1,5 +1,5 @@
 # SLM with ultravox & llama3.1, trained wtih knowledge distillation.
-exp_name: "ultravox-v0_3"
+exp_name: "ultravox-v0_4"
 
 # Make sure to accept the license agreement on huggingface hub
 text_model: "meta-llama/Meta-Llama-3.1-8B-Instruct"
@@ -14,10 +14,11 @@ loss_config:
 val_sets: ["anyinstruct", "soda", "peoplespeech"]
 
 batch_size: 24
-max_steps: 7200 # x8x24 = 1,382,400 samples
+max_steps: 14400 # x8x24 = 2,764,800 samples
 
-data_sets: []
+data_sets: ["anyinstruct"]
 data_dicts:
+# Continuation task
   - path: "fixie-ai/librispeech_asr"
     name: "clean"
     splits:
@@ -35,6 +36,14 @@ data_dicts:
     assistant_template: "{{ continuation }}"
     transcript_template: "{{ text }}"
     weight: 1
+  - path: "fixie-ai/peoples_speech"
+    name: "clean"
+    splits:
+      - "train" # 1_501_271 samples
+    user_template: "Continue the following text using less than 50 words:\n\n<|audio|>"
+    assistant_template: "{{ continuation }}"
+    transcript_template: "{{ text_proc.format_asr_text(text) }}"
+    weight: 8
   - path: "fixie-ai/common_voice_17_0"
     name: "en"
     splits:
@@ -43,3 +52,165 @@ data_dicts:
     assistant_template: "{{ continuation }}"
     transcript_template: "{{ text_proc.format_asr_text(sentence) }}"
     weight: 8
+  - path: "fixie-ai/common_voice_17_0" # continuation-task entry: audio prompt in, text continuation out
+    name: "ar"
+    splits:
+      - "train" # 28_369 samples
+    user_template: "Continue the following text using less than 50 words:\n\n<|audio|>"
+    assistant_template: "{{ continuation }}"
+    transcript_template: "{{ sentence }}" # raw sentence; the preceding common_voice entry wraps it in text_proc.format_asr_text
+    weight: 0.2
+  - path: "fixie-ai/common_voice_17_0"
+    name: "de"
+    splits:
+      - "train" # 589_100 samples
+    user_template: "Continue the following text using less than 50 words:\n\n<|audio|>"
+    assistant_template: "{{ continuation }}"
+    transcript_template: "{{ sentence }}"
+    weight: 4
+  - path: "fixie-ai/common_voice_17_0"
+    name: "es"
+    splits:
+      - "train" # 336_846 samples
+    user_template: "Continue the following text using less than 50 words:\n\n<|audio|>"
+    assistant_template: "{{ continuation }}"
+    transcript_template: "{{ sentence }}"
+    weight: 3
+  - path: "fixie-ai/common_voice_17_0"
+    name: "fr"
+    splits:
+      - "train" # 558_054 samples
+    user_template: "Continue the following text using less than 50 words:\n\n<|audio|>"
+    assistant_template: "{{ continuation }}"
+    transcript_template: "{{ sentence }}"
+    weight: 4
+  - path: "fixie-ai/common_voice_17_0"
+    name: "it"
+    splits:
+      - "train" # 169_771 samples
+    user_template: "Continue the following text using less than 50 words:\n\n<|audio|>"
+    assistant_template: "{{ continuation }}"
+    transcript_template: "{{ sentence }}"
+    weight: 1.2
+  - path: "fixie-ai/common_voice_17_0"
+    name: "ja"
+    splits:
+      - "train" # 10_039 samples
+    user_template: "Continue the following text using less than 50 words:\n\n<|audio|>"
+    assistant_template: "{{ continuation }}"
+    transcript_template: "{{ sentence }}"
+    weight: 0.1
+  - path: "fixie-ai/common_voice_17_0"
+    name: "pt"
+    splits:
+      - "train" # 21_968 samples
+    user_template: "Continue the following text using less than 50 words:\n\n<|audio|>"
+    assistant_template: "{{ continuation }}"
+    transcript_template: "{{ sentence }}"
+    weight: 0.2
+  - path: "fixie-ai/common_voice_17_0"
+    name: "ru"
+    splits:
+      - "train" # 26_377 samples
+    user_template: "Continue the following text using less than 50 words:\n\n<|audio|>"
+    assistant_template: "{{ continuation }}"
+    transcript_template: "{{ sentence }}"
+    weight: 0.2
+# ASR task
+  - path: "fixie-ai/librispeech_asr" # ASR-task entry: the target is the transcript itself
+    name: "clean"
+    splits:
+      - "train.100" # 28_539 samples
+      - "train.360" # 104_014 samples
+    user_template: "{{ dataset._get_transcribe_prompt() }}" # presumably draws from the transcribe prompts in ultravox/data/datasets.py — TODO confirm
+    assistant_template: "{{ text }}" # same expression as transcript_template below
+    transcript_template: "{{ text }}"
+    weight: 0.1
+  - path: "fixie-ai/librispeech_asr"
+    name: "other"
+    splits:
+      - "train.500" # 148_688 samples
+    user_template: "{{ dataset._get_transcribe_prompt() }}"
+    assistant_template: "{{ text }}"
+    transcript_template: "{{ text }}"
+    weight: 0.1
+  - path: "fixie-ai/peoples_speech"
+    name: "clean"
+    splits:
+      - "train" # 1_501_271 samples
+    user_template: "{{ dataset._get_transcribe_prompt() }}"
+    assistant_template: "{{ text_proc.format_asr_text(text) }}"
+    transcript_template: "{{ text_proc.format_asr_text(text) }}"
+    weight: 0.8
+  - path: "fixie-ai/common_voice_17_0"
+    name: "en"
+    splits:
+      - "train" # 1_101_170 samples
+    user_template: "{{ dataset._get_transcribe_prompt() }}"
+    assistant_template: "{{ text_proc.format_asr_text(sentence) }}"
+    transcript_template: "{{ text_proc.format_asr_text(sentence) }}"
+    weight: 0.8
+  - path: "fixie-ai/common_voice_17_0" # ASR-task entry for the "ar" subset
+    name: "ar"
+    splits:
+      - "train" # 28_369 samples
+    user_template: "{{ dataset._get_transcribe_prompt() }}"
+    assistant_template: "{{ text_proc.format_asr_text(sentence) }}" # NOTE(review): target is formatted but transcript_template below is raw, unlike the "en" ASR entry above where both are formatted — confirm intended
+    transcript_template: "{{ sentence }}"
+    weight: 0.02
+  - path: "fixie-ai/common_voice_17_0"
+    name: "de"
+    splits:
+      - "train" # 589_100 samples
+    user_template: "{{ dataset._get_transcribe_prompt() }}"
+    assistant_template: "{{ text_proc.format_asr_text(sentence) }}"
+    transcript_template: "{{ sentence }}"
+    weight: 0.4
+  - path: "fixie-ai/common_voice_17_0"
+    name: "es"
+    splits:
+      - "train" # 336_846 samples
+    user_template: "{{ dataset._get_transcribe_prompt() }}"
+    assistant_template: "{{ text_proc.format_asr_text(sentence) }}"
+    transcript_template: "{{ sentence }}"
+    weight: 0.3
+  - path: "fixie-ai/common_voice_17_0"
+    name: "fr"
+    splits:
+      - "train" # 558_054 samples
+    user_template: "{{ dataset._get_transcribe_prompt() }}"
+    assistant_template: "{{ text_proc.format_asr_text(sentence) }}"
+    transcript_template: "{{ sentence }}"
+    weight: 0.4
+  - path: "fixie-ai/common_voice_17_0"
+    name: "it"
+    splits:
+      - "train" # 169_771 samples
+    user_template: "{{ dataset._get_transcribe_prompt() }}"
+    assistant_template: "{{ text_proc.format_asr_text(sentence) }}"
+    transcript_template: "{{ sentence }}"
+    weight: 0.12
+  - path: "fixie-ai/common_voice_17_0"
+    name: "ja"
+    splits:
+      - "train" # 10_039 samples
+    user_template: "{{ dataset._get_transcribe_prompt() }}"
+    assistant_template: "{{ text_proc.format_asr_text(sentence) }}"
+    transcript_template: "{{ sentence }}"
+    weight: 0.01
+  - path: "fixie-ai/common_voice_17_0"
+    name: "pt"
+    splits:
+      - "train" # 21_968 samples
+    user_template: "{{ dataset._get_transcribe_prompt() }}"
+    assistant_template: "{{ text_proc.format_asr_text(sentence) }}"
+    transcript_template: "{{ sentence }}"
+    weight: 0.02
+  - path: "fixie-ai/common_voice_17_0"
+    name: "ru"
+    splits:
+      - "train" # 26_377 samples
+    user_template: "{{ dataset._get_transcribe_prompt() }}"
+    assistant_template: "{{ text_proc.format_asr_text(sentence) }}"
+    transcript_template: "{{ sentence }}"
+    weight: 0.02