V-Sekai · Ughuuu · Jan 16, 2024 · Jan 16, 2024 · Jan 16, 2024 · Jan 16, 2024
diff --git a/bin/addons/godot_whisper/capture_stream_to_text.gd b/bin/addons/godot_whisper/capture_stream_to_text.gd
@@ -129,7 +129,7 @@ func _add_timer():
 	var timer_node = Timer.new()
 	timer_node.one_shot = false
 	timer_node.autostart = true
-	timer_node.wait_time = 1
+	timer_node.wait_time = 0.5
 	add_child(timer_node)
 	timer_node.connect("timeout",self._on_timer_timeout)
 
@@ -140,7 +140,15 @@ func _on_timer_timeout():
 	if is_running:
 		_speech_to_text_singleton.add_audio_buffer(buffer)
 
-func _remove_special_characters(message: String):
+func _remove_special_characters(message: String, is_partial: bool):
+	if is_partial == false:
+		if message.ends_with("[_END_]"):
+			message = message.trim_suffix("[_END_]")
+		else:
+			var end_character := message.find("[_TT_")
+			if end_character != -1:
+				message = message.substr(0, end_character)
+
 	var special_characters = [ \
 		{ "start": "[", "end": "]" }, \
 		{ "start": "<", "end": ">" }]
@@ -154,7 +162,7 @@ func _remove_special_characters(message: String):
 
 func _update_transcribed_msgs_func(process_time_ms: int, transcribed_msgs: Array):
 	for transcribed_msg  in transcribed_msgs:
-		var cur_text = _remove_special_characters(transcribed_msg["text"])
+		var cur_text = _remove_special_characters(transcribed_msg["text"], transcribed_msg["is_partial"])
 
 		if transcribed_msg["is_partial"]==false:
 			if cur_text.ends_with("?") or cur_text.ends_with(",") or cur_text.ends_with("."):

diff --git a/src/speech_to_text.cpp b/src/speech_to_text.cpp
@@ -398,7 +398,7 @@ void SpeechToText::run() {
 	SpeechToText *speech_to_text_obj = SpeechToText::get_singleton();
 	whisper_full_params whisper_params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
 	// See here for example https://github.com/ggerganov/whisper.cpp/blob/master/examples/stream/stream.cpp#L302
-	whisper_params.max_len = 1;
+	whisper_params.max_len = 0;
 	whisper_params.print_progress = false;
 	whisper_params.print_special = false;
 	whisper_params.print_realtime = false;
@@ -408,7 +408,7 @@ void SpeechToText::run() {
 	whisper_params.translate = speech_to_text_obj->params.translate;
 	whisper_params.single_segment = true;
 	whisper_params.no_timestamps = false;
-	whisper_params.token_timestamps = false;
+	whisper_params.token_timestamps = true;
 	whisper_params.max_tokens = speech_to_text_obj->params.max_tokens;
 	whisper_params.language = speech_to_text_obj->params.language.c_str();
 	whisper_params.n_threads = speech_to_text_obj->params.n_threads;
@@ -510,6 +510,7 @@ void SpeechToText::run() {
 		{
 			transcribed_msg msg;
 			const int n_segments = whisper_full_n_segments(speech_to_text_obj->context_instance);
+			int64_t last_t = 0;
 			for (int i = 0; i < n_segments; ++i) {
 				const int n_tokens = whisper_full_n_tokens(speech_to_text_obj->context_instance, i);
 				for (int j = 0; j < n_tokens; j++) {
@@ -524,6 +525,12 @@ void SpeechToText::run() {
 						//WARN_PRINT("Skipping token low plog " + String::num(token.p) + " " + String::num(token.plog) + " " + text);
 						//continue;
 					}
+					if (String(text).begins_with("[_TT_") && last_t == 0) {
+						if (j > 0) {
+							auto last_token = whisper_full_get_token_data(speech_to_text_obj->context_instance, i, j - 1);
+							last_t = last_token.t1;
+						}
+					}
 					msg.text += text;
 				}
 			}
@@ -545,18 +552,33 @@ void SpeechToText::run() {
 			 * Clear audio buffer when the size exceeds iteration threshold or
 			 * speech end is detected.
 			 */
+			if (speech_has_end) {
+				msg.text += "[_END_]";
+			}
+
 			if (pcmf32.size() > n_samples_iter_threshold || speech_has_end) {
 				const auto t_now = Time::get_singleton()->get_ticks_msec();
 				const auto t_diff = t_now - speech_to_text_obj->t_last_iter;
 				speech_to_text_obj->t_last_iter = t_now;
-
 				msg.is_partial = false;
 				/**
 				 * Keep the last few samples in the audio buffer, so the next
 				 * iteration has a smoother start.
 				 */
-				std::vector<float> last(pcmf32.end() - n_samples_keep_iter, pcmf32.end());
-				pcmf32 = std::move(last);
+				if (last_t == 0 || speech_has_end) {
+					std::vector<float> last(pcmf32.end() - n_samples_keep_iter, pcmf32.end());
+					pcmf32 = std::move(last);
+				} else {
+					int target_index = int(last_t / 100.0 * WHISPER_SAMPLE_RATE);
+					if (target_index >= pcmf32.size()) {
+						std::vector<float> last(pcmf32.end() - n_samples_keep_iter, pcmf32.end());
+						pcmf32 = std::move(last);
+					} else {
+						std::vector<float> last(pcmf32.begin() + target_index, pcmf32.end());
+						pcmf32 = std::move(last);
+					}
+				}
+
 			} else {
 				msg.is_partial = true;
 			}