From 1f5578136027da944b70543dd0588e60b52553e5 Mon Sep 17 00:00:00 2001 From: mimi Date: Tue, 16 Jan 2024 09:23:48 +0800 Subject: [PATCH 1/3] # fix missing text --- .../godot_whisper/capture_stream_to_text.gd | 14 ++++++-- src/speech_to_text.cpp | 32 ++++++++++++++++--- 2 files changed, 38 insertions(+), 8 deletions(-) diff --git a/bin/addons/godot_whisper/capture_stream_to_text.gd b/bin/addons/godot_whisper/capture_stream_to_text.gd index d8759d98..4e59f38d 100644 --- a/bin/addons/godot_whisper/capture_stream_to_text.gd +++ b/bin/addons/godot_whisper/capture_stream_to_text.gd @@ -129,7 +129,7 @@ func _add_timer(): var timer_node = Timer.new() timer_node.one_shot = false timer_node.autostart = true - timer_node.wait_time = 1 + timer_node.wait_time = 0.5 add_child(timer_node) timer_node.connect("timeout",self._on_timer_timeout) @@ -140,7 +140,15 @@ func _on_timer_timeout(): if is_running: _speech_to_text_singleton.add_audio_buffer(buffer) -func _remove_special_characters(message: String): +func _remove_special_characters(message: String, is_partial: bool): + if is_partial == false: + if message.ends_with("[_END_]"): + message = message.trim_suffix("[_END_]") + else: + var end_character := message.find("[_TT_") + if end_character != -1: + message = message.substr(0, end_character) + var special_characters = [ \ { "start": "[", "end": "]" }, \ { "start": "<", "end": ">" }] @@ -154,7 +162,7 @@ func _remove_special_characters(message: String): func _update_transcribed_msgs_func(process_time_ms: int, transcribed_msgs: Array): for transcribed_msg in transcribed_msgs: - var cur_text = _remove_special_characters(transcribed_msg["text"]) + var cur_text = _remove_special_characters(transcribed_msg["text"], transcribed_msg["is_partial"]) if transcribed_msg["is_partial"]==false: if cur_text.ends_with("?") or cur_text.ends_with(",") or cur_text.ends_with("."): diff --git a/src/speech_to_text.cpp b/src/speech_to_text.cpp index 08c0cfdb..f6159819 100644 --- a/src/speech_to_text.cpp +++ b/src/speech_to_text.cpp @@ -398,7 +398,7 @@ void SpeechToText::run() { SpeechToText *speech_to_text_obj = SpeechToText::get_singleton(); whisper_full_params whisper_params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY); // See here for example https://github.com/ggerganov/whisper.cpp/blob/master/examples/stream/stream.cpp#L302 - whisper_params.max_len = 1; + whisper_params.max_len = 0; whisper_params.print_progress = false; whisper_params.print_special = false; whisper_params.print_realtime = false; @@ -408,7 +408,7 @@ void SpeechToText::run() { whisper_params.translate = speech_to_text_obj->params.translate; whisper_params.single_segment = true; whisper_params.no_timestamps = false; - whisper_params.token_timestamps = false; + whisper_params.token_timestamps = true; whisper_params.max_tokens = speech_to_text_obj->params.max_tokens; whisper_params.language = speech_to_text_obj->params.language.c_str(); whisper_params.n_threads = speech_to_text_obj->params.n_threads; @@ -510,6 +510,7 @@ void SpeechToText::run() { { transcribed_msg msg; const int n_segments = whisper_full_n_segments(speech_to_text_obj->context_instance); + int64_t last_t = 0; for (int i = 0; i < n_segments; ++i) { const int n_tokens = whisper_full_n_tokens(speech_to_text_obj->context_instance, i); for (int j = 0; j < n_tokens; j++) { @@ -524,6 +525,12 @@ void SpeechToText::run() { //WARN_PRINT("Skipping token low plog " + String::num(token.p) + " " + String::num(token.plog) + " " + text); //continue; } + if (String(text).begins_with("[_TT_") && last_t == 0) { + if (j > 0) { + auto last_token = whisper_full_get_token_data(speech_to_text_obj->context_instance, i, j - 1); + last_t = last_token.t1; + } + } msg.text += text; } } @@ -545,18 +552,33 @@ void SpeechToText::run() { * Clear audio buffer when the size exceeds iteration threshold or * speech end is detected. */ + if (speech_has_end) { + msg.text += "[_END_]"; + } + if (pcmf32.size() > n_samples_iter_threshold || speech_has_end) { const auto t_now = Time::get_singleton()->get_ticks_msec(); const auto t_diff = t_now - speech_to_text_obj->t_last_iter; speech_to_text_obj->t_last_iter = t_now; - msg.is_partial = false; /** * Keep the last few samples in the audio buffer, so the next * iteration has a smoother start. */ - std::vector last(pcmf32.end() - n_samples_keep_iter, pcmf32.end()); - pcmf32 = std::move(last); + if (last_t == 0 || speech_has_end) { + std::vector last(pcmf32.end() - n_samples_keep_iter, pcmf32.end()); + pcmf32 = std::move(last); + } else { + int target_index = int(last_t / 100.0 * WHISPER_SAMPLE_RATE); + if (target_index >= pcmf32.size()) { + std::vector last(pcmf32.end() - n_samples_keep_iter, pcmf32.end()); + pcmf32 = std::move(last); + } else { + std::vector last(pcmf32.begin() + target_index, pcmf32.end()); + pcmf32 = std::move(last); + } + } + } else { msg.is_partial = true; } From d98cd7c08b02dd6f0fc960bb4b72b455d056c6ba Mon Sep 17 00:00:00 2001 From: mimi Date: Tue, 16 Jan 2024 17:04:17 +0800 Subject: [PATCH 2/3] # add split func --- .../godot_whisper/capture_stream_to_text.gd | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/bin/addons/godot_whisper/capture_stream_to_text.gd b/bin/addons/godot_whisper/capture_stream_to_text.gd index 4e59f38d..9d452c26 100644 --- a/bin/addons/godot_whisper/capture_stream_to_text.gd +++ b/bin/addons/godot_whisper/capture_stream_to_text.gd @@ -147,7 +147,8 @@ func _remove_special_characters(message: String, is_partial: bool): else: var end_character := message.find("[_TT_") if end_character != -1: - message = message.substr(0, end_character) + message = message.substr(0, end_character) + "{SPLIT}" + message.substr(end_character) + var special_characters = [ \ { "start": "[", "end": "]" }, \ @@ -158,18 +159,28 @@ func _remove_special_characters(message: String, is_partial: bool): var end_character := message.find(special_character["end"]) if end_character != -1: message = message.substr(0, begin_character) + message.substr(end_character + 1) + + message = message.trim_suffix("{SPLIT}") return message func _update_transcribed_msgs_func(process_time_ms: int, transcribed_msgs: Array): for transcribed_msg in transcribed_msgs: var cur_text = _remove_special_characters(transcribed_msg["text"], transcribed_msg["is_partial"]) - if transcribed_msg["is_partial"]==false: if cur_text.ends_with("?") or cur_text.ends_with(",") or cur_text.ends_with("."): pass else: cur_text = cur_text + "." - update_transcribed_msg.emit(_last_index, transcribed_msg["is_partial"], cur_text, process_time_ms) + + if transcribed_msg["is_partial"]==false: + var split_character := cur_text.find("{SPLIT}") + if split_character!=-1: + update_transcribed_msg.emit(_last_index, transcribed_msg["is_partial"], cur_text.substr(0, split_character), process_time_ms) + update_transcribed_msg.emit(_last_index+1, true, cur_text.substr(split_character+7), process_time_ms) + else: + update_transcribed_msg.emit(_last_index, transcribed_msg["is_partial"], cur_text, process_time_ms) + else: + update_transcribed_msg.emit(_last_index, transcribed_msg["is_partial"], cur_text, process_time_ms) if transcribed_msg["is_partial"]==false: _last_index+=1 From 55e1e0d4754681e7d8cbefd46adecdb05304461b Mon Sep 17 00:00:00 2001 From: aiaimimi0920 <153103332+aiaimimi0920@users.noreply.github.com> Date: Tue, 16 Jan 2024 17:30:43 +0800 Subject: [PATCH 3/3] Update capture_stream_to_text.gd change to 1.0 --- bin/addons/godot_whisper/capture_stream_to_text.gd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/addons/godot_whisper/capture_stream_to_text.gd b/bin/addons/godot_whisper/capture_stream_to_text.gd index 9d452c26..54de71d1 100644 --- a/bin/addons/godot_whisper/capture_stream_to_text.gd +++ b/bin/addons/godot_whisper/capture_stream_to_text.gd @@ -129,7 +129,7 @@ func _add_timer(): var timer_node = Timer.new() timer_node.one_shot = false timer_node.autostart = true - timer_node.wait_time = 0.5 + timer_node.wait_time = 1.0 add_child(timer_node) timer_node.connect("timeout",self._on_timer_timeout)