diff --git a/bin/addons/godot_whisper/capture_stream_to_text.gd b/bin/addons/godot_whisper/capture_stream_to_text.gd
index d8759d98..54de71d1 100644
--- a/bin/addons/godot_whisper/capture_stream_to_text.gd
+++ b/bin/addons/godot_whisper/capture_stream_to_text.gd
@@ -129,7 +129,7 @@ func _add_timer():
 	var timer_node = Timer.new()
 	timer_node.one_shot = false
 	timer_node.autostart = true
-	timer_node.wait_time = 1
+	timer_node.wait_time = 1.0
 	add_child(timer_node)
 	timer_node.connect("timeout",self._on_timer_timeout)
 
@@ -140,7 +140,19 @@ func _on_timer_timeout():
 	if is_running:
 		_speech_to_text_singleton.add_audio_buffer(buffer)
 
-func _remove_special_characters(message: String):
+func _remove_special_characters(message: String, is_partial: bool):
+	# For finalized messages, drop a trailing "[_END_]" marker; otherwise put a
+	# "{SPLIT}" marker in front of the first "[_TT_" token so the caller can
+	# separate the text before it from the text after it.
+	if is_partial == false:
+		if message.ends_with("[_END_]"):
+			message = message.trim_suffix("[_END_]")
+		else:
+			var end_character := message.find("[_TT_")
+			if end_character != -1:
+				message = message.substr(0, end_character) + "{SPLIT}" + message.substr(end_character)
+
+
 	var special_characters = [ \
 		{ "start": "[", "end": "]" }, \
 		{ "start": "<", "end": ">" }]
@@ -150,18 +162,29 @@ func _remove_special_characters(message: String):
 			var end_character := message.find(special_character["end"])
 			if end_character != -1:
 				message = message.substr(0, begin_character) + message.substr(end_character + 1)
+
+	message = message.trim_suffix("{SPLIT}")
 	return message
 
 func _update_transcribed_msgs_func(process_time_ms: int, transcribed_msgs: Array):
 	for transcribed_msg in transcribed_msgs:
-		var cur_text = _remove_special_characters(transcribed_msg["text"])
-
+		var cur_text = _remove_special_characters(transcribed_msg["text"], transcribed_msg["is_partial"])
+		# Detach the text that follows "{SPLIT}" before punctuating, so the
+		# closing "." is added to the finalized text and not to the carried tail.
+		var carried_text = null
+		if transcribed_msg["is_partial"]==false:
+			var split_character: int = cur_text.find("{SPLIT}")
+			if split_character!=-1:
+				carried_text = cur_text.substr(split_character+7)
+				cur_text = cur_text.substr(0, split_character)
 		if transcribed_msg["is_partial"]==false:
 			if cur_text.ends_with("?") or cur_text.ends_with(",") or cur_text.ends_with("."):
 				pass
 			else:
 				cur_text = cur_text + "."
 		update_transcribed_msg.emit(_last_index, transcribed_msg["is_partial"], cur_text, process_time_ms)
+		if carried_text != null:
+			update_transcribed_msg.emit(_last_index+1, true, carried_text, process_time_ms)
 		if transcribed_msg["is_partial"]==false:
 			_last_index+=1
 
diff --git a/src/speech_to_text.cpp b/src/speech_to_text.cpp
index 08c0cfdb..f6159819 100644
--- a/src/speech_to_text.cpp
+++ b/src/speech_to_text.cpp
@@ -398,7 +398,7 @@ void SpeechToText::run() {
 	SpeechToText *speech_to_text_obj = SpeechToText::get_singleton();
 	whisper_full_params whisper_params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
 	// See here for example https://github.com/ggerganov/whisper.cpp/blob/master/examples/stream/stream.cpp#L302
-	whisper_params.max_len = 1;
+	whisper_params.max_len = 0;
 	whisper_params.print_progress = false;
 	whisper_params.print_special = false;
 	whisper_params.print_realtime = false;
@@ -408,7 +408,7 @@ void SpeechToText::run() {
 	whisper_params.translate = speech_to_text_obj->params.translate;
 	whisper_params.single_segment = true;
 	whisper_params.no_timestamps = false;
-	whisper_params.token_timestamps = false;
+	whisper_params.token_timestamps = true;
 	whisper_params.max_tokens = speech_to_text_obj->params.max_tokens;
 	whisper_params.language = speech_to_text_obj->params.language.c_str();
 	whisper_params.n_threads = speech_to_text_obj->params.n_threads;
@@ -510,6 +510,7 @@ void SpeechToText::run() {
 		{
 			transcribed_msg msg;
 			const int n_segments = whisper_full_n_segments(speech_to_text_obj->context_instance);
+			int64_t last_t = 0;
 			for (int i = 0; i < n_segments; ++i) {
 				const int n_tokens = whisper_full_n_tokens(speech_to_text_obj->context_instance, i);
 				for (int j = 0; j < n_tokens; j++) {
@@ -524,6 +525,12 @@ void SpeechToText::run() {
 						//WARN_PRINT("Skipping token low plog " + String::num(token.p) + " " + String::num(token.plog) + " " + text);
 						//continue;
 					}
+					if (String(text).begins_with("[_TT_") && last_t == 0) {
+						if (j > 0) {
+							auto last_token = whisper_full_get_token_data(speech_to_text_obj->context_instance, i, j - 1);
+							last_t = last_token.t1;
+						}
+					}
 					msg.text += text;
 				}
 			}
@@ -545,18 +552,34 @@ void SpeechToText::run() {
 			 * Clear audio buffer when the size exceeds iteration threshold or
 			 * speech end is detected.
 			 */
+			if (speech_has_end) {
+				msg.text += "[_END_]";
+			}
+
 			if (pcmf32.size() > n_samples_iter_threshold || speech_has_end) {
 				const auto t_now = Time::get_singleton()->get_ticks_msec();
 				const auto t_diff = t_now - speech_to_text_obj->t_last_iter;
 				speech_to_text_obj->t_last_iter = t_now;
 
 				msg.is_partial = false;
 				/**
 				 * Keep the last few samples in the audio buffer, so the next
 				 * iteration has a smoother start.
 				 */
-				std::vector last(pcmf32.end() - n_samples_keep_iter, pcmf32.end());
-				pcmf32 = std::move(last);
+				if (last_t == 0 || speech_has_end) {
+					std::vector last(pcmf32.end() - n_samples_keep_iter, pcmf32.end());
+					pcmf32 = std::move(last);
+				} else {
+					// last_t is scaled as hundredths of a second to get a sample index.
+					int target_index = int(last_t / 100.0 * WHISPER_SAMPLE_RATE);
+					if (target_index >= pcmf32.size()) {
+						std::vector last(pcmf32.end() - n_samples_keep_iter, pcmf32.end());
+						pcmf32 = std::move(last);
+					} else {
+						std::vector last(pcmf32.begin() + target_index, pcmf32.end());
+						pcmf32 = std::move(last);
+					}
+				}
 			} else {
 				msg.is_partial = true;
 			}