From 1f5578136027da944b70543dd0588e60b52553e5 Mon Sep 17 00:00:00 2001
From: mimi <aiaimimi0920@gmail.com>
Date: Tue, 16 Jan 2024 09:23:48 +0800
Subject: [PATCH 1/3] Fix missing text

---
 .../godot_whisper/capture_stream_to_text.gd   | 14 ++++++--
 src/speech_to_text.cpp                        | 32 ++++++++++++++++---
 2 files changed, 38 insertions(+), 8 deletions(-)

diff --git a/bin/addons/godot_whisper/capture_stream_to_text.gd b/bin/addons/godot_whisper/capture_stream_to_text.gd
index d8759d98..4e59f38d 100644
--- a/bin/addons/godot_whisper/capture_stream_to_text.gd
+++ b/bin/addons/godot_whisper/capture_stream_to_text.gd
@@ -129,7 +129,7 @@ func _add_timer():
 	var timer_node = Timer.new()
 	timer_node.one_shot = false
 	timer_node.autostart = true
-	timer_node.wait_time = 1
+	timer_node.wait_time = 0.5
 	add_child(timer_node)
 	timer_node.connect("timeout",self._on_timer_timeout)
 
@@ -140,7 +140,15 @@ func _on_timer_timeout():
 	if is_running:
 		_speech_to_text_singleton.add_audio_buffer(buffer)
 
-func _remove_special_characters(message: String):
+func _remove_special_characters(message: String, is_partial: bool):
+	if is_partial == false:
+		if message.ends_with("[_END_]"):
+			message = message.trim_suffix("[_END_]")
+		else:
+			var end_character := message.find("[_TT_")
+			if end_character != -1:
+				message = message.substr(0, end_character)
+	
 	var special_characters = [ \
 		{ "start": "[", "end": "]" }, \
 		{ "start": "<", "end": ">" }]
@@ -154,7 +162,7 @@ func _remove_special_characters(message: String):
 
 func _update_transcribed_msgs_func(process_time_ms: int, transcribed_msgs: Array):
 	for transcribed_msg  in transcribed_msgs:
-		var cur_text = _remove_special_characters(transcribed_msg["text"])
+		var cur_text = _remove_special_characters(transcribed_msg["text"], transcribed_msg["is_partial"])
 		
 		if transcribed_msg["is_partial"]==false:
 			if cur_text.ends_with("?") or cur_text.ends_with(",") or cur_text.ends_with("."):
diff --git a/src/speech_to_text.cpp b/src/speech_to_text.cpp
index 08c0cfdb..f6159819 100644
--- a/src/speech_to_text.cpp
+++ b/src/speech_to_text.cpp
@@ -398,7 +398,7 @@ void SpeechToText::run() {
 	SpeechToText *speech_to_text_obj = SpeechToText::get_singleton();
 	whisper_full_params whisper_params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
 	// See here for example https://github.com/ggerganov/whisper.cpp/blob/master/examples/stream/stream.cpp#L302
-	whisper_params.max_len = 1;
+	whisper_params.max_len = 0;
 	whisper_params.print_progress = false;
 	whisper_params.print_special = false;
 	whisper_params.print_realtime = false;
@@ -408,7 +408,7 @@ void SpeechToText::run() {
 	whisper_params.translate = speech_to_text_obj->params.translate;
 	whisper_params.single_segment = true;
 	whisper_params.no_timestamps = false;
-	whisper_params.token_timestamps = false;
+	whisper_params.token_timestamps = true;
 	whisper_params.max_tokens = speech_to_text_obj->params.max_tokens;
 	whisper_params.language = speech_to_text_obj->params.language.c_str();
 	whisper_params.n_threads = speech_to_text_obj->params.n_threads;
@@ -510,6 +510,7 @@ void SpeechToText::run() {
 		{
 			transcribed_msg msg;
 			const int n_segments = whisper_full_n_segments(speech_to_text_obj->context_instance);
+			int64_t last_t = 0;
 			for (int i = 0; i < n_segments; ++i) {
 				const int n_tokens = whisper_full_n_tokens(speech_to_text_obj->context_instance, i);
 				for (int j = 0; j < n_tokens; j++) {
@@ -524,6 +525,12 @@ void SpeechToText::run() {
 						//WARN_PRINT("Skipping token low plog " + String::num(token.p) + " " + String::num(token.plog) + " " + text);
 						//continue;
 					}
+					if (String(text).begins_with("[_TT_") && last_t == 0) {
+						if (j > 0) {
+							auto last_token = whisper_full_get_token_data(speech_to_text_obj->context_instance, i, j - 1);
+							last_t = last_token.t1;
+						}
+					}
 					msg.text += text;
 				}
 			}
@@ -545,18 +552,33 @@ void SpeechToText::run() {
 			 * Clear audio buffer when the size exceeds iteration threshold or
 			 * speech end is detected.
 			 */
+			if (speech_has_end) {
+				msg.text += "[_END_]";
+			}
+
 			if (pcmf32.size() > n_samples_iter_threshold || speech_has_end) {
 				const auto t_now = Time::get_singleton()->get_ticks_msec();
 				const auto t_diff = t_now - speech_to_text_obj->t_last_iter;
 				speech_to_text_obj->t_last_iter = t_now;
-
 				msg.is_partial = false;
 				/**
 				 * Keep the last few samples in the audio buffer, so the next
 				 * iteration has a smoother start.
 				 */
-				std::vector<float> last(pcmf32.end() - n_samples_keep_iter, pcmf32.end());
-				pcmf32 = std::move(last);
+				if (last_t == 0 || speech_has_end) {
+					std::vector<float> last(pcmf32.end() - n_samples_keep_iter, pcmf32.end());
+					pcmf32 = std::move(last);
+				} else {
+					int target_index = int(last_t / 100.0 * WHISPER_SAMPLE_RATE);
+					if (target_index >= pcmf32.size()) {
+						std::vector<float> last(pcmf32.end() - n_samples_keep_iter, pcmf32.end());
+						pcmf32 = std::move(last);
+					} else {
+						std::vector<float> last(pcmf32.begin() + target_index, pcmf32.end());
+						pcmf32 = std::move(last);
+					}
+				}
+
 			} else {
 				msg.is_partial = true;
 			}

From d98cd7c08b02dd6f0fc960bb4b72b455d056c6ba Mon Sep 17 00:00:00 2001
From: mimi <aiaimimi0920@gmail.com>
Date: Tue, 16 Jan 2024 17:04:17 +0800
Subject: [PATCH 2/3] Add {SPLIT} marker handling for transcribed messages

---
 .../godot_whisper/capture_stream_to_text.gd     | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/bin/addons/godot_whisper/capture_stream_to_text.gd b/bin/addons/godot_whisper/capture_stream_to_text.gd
index 4e59f38d..9d452c26 100644
--- a/bin/addons/godot_whisper/capture_stream_to_text.gd
+++ b/bin/addons/godot_whisper/capture_stream_to_text.gd
@@ -147,7 +147,8 @@ func _remove_special_characters(message: String, is_partial: bool):
 		else:
 			var end_character := message.find("[_TT_")
 			if end_character != -1:
-				message = message.substr(0, end_character)
+				message = message.substr(0, end_character) + "{SPLIT}" + message.substr(end_character)
+				
 	
 	var special_characters = [ \
 		{ "start": "[", "end": "]" }, \
@@ -158,18 +159,28 @@ func _remove_special_characters(message: String, is_partial: bool):
 			var end_character := message.find(special_character["end"])
 			if end_character != -1:
 				message = message.substr(0, begin_character) + message.substr(end_character + 1)
+	
+	message = message.trim_suffix("{SPLIT}")
 	return message
 
 func _update_transcribed_msgs_func(process_time_ms: int, transcribed_msgs: Array):
 	for transcribed_msg  in transcribed_msgs:
 		var cur_text = _remove_special_characters(transcribed_msg["text"], transcribed_msg["is_partial"])
-		
 		if transcribed_msg["is_partial"]==false:
 			if cur_text.ends_with("?") or cur_text.ends_with(",") or cur_text.ends_with("."):
 				pass
 			else:
 				cur_text = cur_text + "."
-		update_transcribed_msg.emit(_last_index, transcribed_msg["is_partial"], cur_text, process_time_ms)
+				
+		if transcribed_msg["is_partial"]==false:
+			var split_character := cur_text.find("{SPLIT}")
+			if split_character!=-1:
+				update_transcribed_msg.emit(_last_index, transcribed_msg["is_partial"], cur_text.substr(0, split_character), process_time_ms)
+				update_transcribed_msg.emit(_last_index+1, true, cur_text.substr(split_character+7), process_time_ms)
+			else:
+				update_transcribed_msg.emit(_last_index, transcribed_msg["is_partial"], cur_text, process_time_ms)
+		else:
+			update_transcribed_msg.emit(_last_index, transcribed_msg["is_partial"], cur_text, process_time_ms)
 		if transcribed_msg["is_partial"]==false:
 			_last_index+=1
 

From 55e1e0d4754681e7d8cbefd46adecdb05304461b Mon Sep 17 00:00:00 2001
From: aiaimimi0920 <153103332+aiaimimi0920@users.noreply.github.com>
Date: Tue, 16 Jan 2024 17:30:43 +0800
Subject: [PATCH 3/3] Update capture_stream_to_text.gd: change timer wait_time to 1.0

---
 bin/addons/godot_whisper/capture_stream_to_text.gd | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bin/addons/godot_whisper/capture_stream_to_text.gd b/bin/addons/godot_whisper/capture_stream_to_text.gd
index 9d452c26..54de71d1 100644
--- a/bin/addons/godot_whisper/capture_stream_to_text.gd
+++ b/bin/addons/godot_whisper/capture_stream_to_text.gd
@@ -129,7 +129,7 @@ func _add_timer():
 	var timer_node = Timer.new()
 	timer_node.one_shot = false
 	timer_node.autostart = true
-	timer_node.wait_time = 0.5
+	timer_node.wait_time = 1.0
 	add_child(timer_node)
 	timer_node.connect("timeout",self._on_timer_timeout)