Skip to content

Commit

Permalink
fix missing text (#40)
Browse files Browse the repository at this point in the history
  • Loading branch information
aiaimimi0920 authored Jan 16, 2024
1 parent 25826b7 commit dc741ff
Show file tree
Hide file tree
Showing 2 changed files with 51 additions and 10 deletions.
29 changes: 24 additions & 5 deletions bin/addons/godot_whisper/capture_stream_to_text.gd
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ func _add_timer():
var timer_node = Timer.new()
timer_node.one_shot = false
timer_node.autostart = true
timer_node.wait_time = 1
timer_node.wait_time = 1.0
add_child(timer_node)
timer_node.connect("timeout",self._on_timer_timeout)

Expand All @@ -140,7 +140,16 @@ func _on_timer_timeout():
if is_running:
_speech_to_text_singleton.add_audio_buffer(buffer)

func _remove_special_characters(message: String):
func _remove_special_characters(message: String, is_partial: bool):
if is_partial == false:
if message.ends_with("[_END_]"):
message = message.trim_suffix("[_END_]")
else:
var end_character := message.find("[_TT_")
if end_character != -1:
message = message.substr(0, end_character) + "{SPLIT}" + message.substr(end_character)


var special_characters = [ \
{ "start": "[", "end": "]" }, \
{ "start": "<", "end": ">" }]
Expand All @@ -150,18 +159,28 @@ func _remove_special_characters(message: String):
var end_character := message.find(special_character["end"])
if end_character != -1:
message = message.substr(0, begin_character) + message.substr(end_character + 1)

message = message.trim_suffix("{SPLIT}")
return message

# Forwards each transcription result to listeners via `update_transcribed_msg`.
#
# process_time_ms: processing time reported alongside every emitted message.
# transcribed_msgs: array of dictionaries; this function reads the "text" and
#   "is_partial" entries of each one.
#
# Partial messages are cleaned and emitted as-is at `_last_index`.
# Finalized messages (is_partial == false) may contain a "{SPLIT}" marker that
# `_remove_special_characters` inserts in front of the first "[_TT_" token.
# Text before the marker is the finalized sentence; text after it is emitted
# as a partial message for the next index. `_last_index` advances once per
# finalized message.
func _update_transcribed_msgs_func(process_time_ms: int, transcribed_msgs: Array):
	const SPLIT_MARKER := "{SPLIT}"
	for transcribed_msg in transcribed_msgs:
		var is_partial = transcribed_msg["is_partial"]
		var cur_text: String = _remove_special_characters(transcribed_msg["text"], is_partial)

		if is_partial == false:
			# Separate the finalized sentence from any trailing text that
			# belongs to the next, still-running message.
			var finished_text := cur_text
			var pending_text := ""
			var split_character := cur_text.find(SPLIT_MARKER)
			var has_pending := split_character != -1
			if has_pending:
				finished_text = cur_text.substr(0, split_character)
				pending_text = cur_text.substr(split_character + SPLIT_MARKER.length())

			# Terminal punctuation belongs on the finalized sentence only;
			# appending it to the whole string would put the "." after the
			# pending tail and leave the finished sentence unterminated.
			if not (finished_text.ends_with("?") or finished_text.ends_with(",") or finished_text.ends_with(".")):
				finished_text = finished_text + "."

			update_transcribed_msg.emit(_last_index, is_partial, finished_text, process_time_ms)
			if has_pending:
				update_transcribed_msg.emit(_last_index + 1, true, pending_text, process_time_ms)
			_last_index += 1
		else:
			update_transcribed_msg.emit(_last_index, is_partial, cur_text, process_time_ms)

Expand Down
32 changes: 27 additions & 5 deletions src/speech_to_text.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -398,7 +398,7 @@ void SpeechToText::run() {
SpeechToText *speech_to_text_obj = SpeechToText::get_singleton();
whisper_full_params whisper_params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
// See here for example https://github.com/ggerganov/whisper.cpp/blob/master/examples/stream/stream.cpp#L302
whisper_params.max_len = 1;
whisper_params.max_len = 0;
whisper_params.print_progress = false;
whisper_params.print_special = false;
whisper_params.print_realtime = false;
Expand All @@ -408,7 +408,7 @@ void SpeechToText::run() {
whisper_params.translate = speech_to_text_obj->params.translate;
whisper_params.single_segment = true;
whisper_params.no_timestamps = false;
whisper_params.token_timestamps = false;
whisper_params.token_timestamps = true;
whisper_params.max_tokens = speech_to_text_obj->params.max_tokens;
whisper_params.language = speech_to_text_obj->params.language.c_str();
whisper_params.n_threads = speech_to_text_obj->params.n_threads;
Expand Down Expand Up @@ -510,6 +510,7 @@ void SpeechToText::run() {
{
transcribed_msg msg;
const int n_segments = whisper_full_n_segments(speech_to_text_obj->context_instance);
int64_t last_t = 0;
for (int i = 0; i < n_segments; ++i) {
const int n_tokens = whisper_full_n_tokens(speech_to_text_obj->context_instance, i);
for (int j = 0; j < n_tokens; j++) {
Expand All @@ -524,6 +525,12 @@ void SpeechToText::run() {
//WARN_PRINT("Skipping token low plog " + String::num(token.p) + " " + String::num(token.plog) + " " + text);
//continue;
}
if (String(text).begins_with("[_TT_") && last_t == 0) {
if (j > 0) {
auto last_token = whisper_full_get_token_data(speech_to_text_obj->context_instance, i, j - 1);
last_t = last_token.t1;
}
}
msg.text += text;
}
}
Expand All @@ -545,18 +552,33 @@ void SpeechToText::run() {
* Clear audio buffer when the size exceeds iteration threshold or
* speech end is detected.
*/
if (speech_has_end) {
msg.text += "[_END_]";
}

if (pcmf32.size() > n_samples_iter_threshold || speech_has_end) {
const auto t_now = Time::get_singleton()->get_ticks_msec();
const auto t_diff = t_now - speech_to_text_obj->t_last_iter;
speech_to_text_obj->t_last_iter = t_now;

msg.is_partial = false;
/**
* Keep the last few samples in the audio buffer, so the next
* iteration has a smoother start.
*/
std::vector<float> last(pcmf32.end() - n_samples_keep_iter, pcmf32.end());
pcmf32 = std::move(last);
if (last_t == 0 || speech_has_end) {
std::vector<float> last(pcmf32.end() - n_samples_keep_iter, pcmf32.end());
pcmf32 = std::move(last);
} else {
int target_index = int(last_t / 100.0 * WHISPER_SAMPLE_RATE);
if (target_index >= pcmf32.size()) {
std::vector<float> last(pcmf32.end() - n_samples_keep_iter, pcmf32.end());
pcmf32 = std::move(last);
} else {
std::vector<float> last(pcmf32.begin() + target_index, pcmf32.end());
pcmf32 = std::move(last);
}
}

} else {
msg.is_partial = true;
}
Expand Down

0 comments on commit dc741ff

Please sign in to comment.