Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix missing text #40

Merged
merged 3 commits on Jan 16, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 11 additions & 3 deletions bin/addons/godot_whisper/capture_stream_to_text.gd
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ func _add_timer():
var timer_node = Timer.new()
timer_node.one_shot = false
timer_node.autostart = true
timer_node.wait_time = 1
timer_node.wait_time = 0.5
Ughuuu marked this conversation as resolved.
Show resolved Hide resolved
add_child(timer_node)
timer_node.connect("timeout",self._on_timer_timeout)

Expand All @@ -140,7 +140,15 @@ func _on_timer_timeout():
if is_running:
_speech_to_text_singleton.add_audio_buffer(buffer)

func _remove_special_characters(message: String):
func _remove_special_characters(message: String, is_partial: bool):
if is_partial == false:
Ughuuu marked this conversation as resolved.
Show resolved Hide resolved
if message.ends_with("[_END_]"):
message = message.trim_suffix("[_END_]")
else:
var end_character := message.find("[_TT_")
if end_character != -1:
message = message.substr(0, end_character)

var special_characters = [ \
{ "start": "[", "end": "]" }, \
{ "start": "<", "end": ">" }]
Expand All @@ -154,7 +162,7 @@ func _remove_special_characters(message: String):

func _update_transcribed_msgs_func(process_time_ms: int, transcribed_msgs: Array):
for transcribed_msg in transcribed_msgs:
var cur_text = _remove_special_characters(transcribed_msg["text"])
var cur_text = _remove_special_characters(transcribed_msg["text"], transcribed_msg["is_partial"])

if transcribed_msg["is_partial"]==false:
if cur_text.ends_with("?") or cur_text.ends_with(",") or cur_text.ends_with("."):
Expand Down
32 changes: 27 additions & 5 deletions src/speech_to_text.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -398,7 +398,7 @@ void SpeechToText::run() {
SpeechToText *speech_to_text_obj = SpeechToText::get_singleton();
whisper_full_params whisper_params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
// See here for example https://github.com/ggerganov/whisper.cpp/blob/master/examples/stream/stream.cpp#L302
whisper_params.max_len = 1;
whisper_params.max_len = 0;
whisper_params.print_progress = false;
whisper_params.print_special = false;
whisper_params.print_realtime = false;
Expand All @@ -408,7 +408,7 @@ void SpeechToText::run() {
whisper_params.translate = speech_to_text_obj->params.translate;
whisper_params.single_segment = true;
whisper_params.no_timestamps = false;
whisper_params.token_timestamps = false;
whisper_params.token_timestamps = true;
whisper_params.max_tokens = speech_to_text_obj->params.max_tokens;
whisper_params.language = speech_to_text_obj->params.language.c_str();
whisper_params.n_threads = speech_to_text_obj->params.n_threads;
Expand Down Expand Up @@ -510,6 +510,7 @@ void SpeechToText::run() {
{
transcribed_msg msg;
const int n_segments = whisper_full_n_segments(speech_to_text_obj->context_instance);
int64_t last_t = 0;
for (int i = 0; i < n_segments; ++i) {
const int n_tokens = whisper_full_n_tokens(speech_to_text_obj->context_instance, i);
for (int j = 0; j < n_tokens; j++) {
Expand All @@ -524,6 +525,12 @@ void SpeechToText::run() {
//WARN_PRINT("Skipping token low plog " + String::num(token.p) + " " + String::num(token.plog) + " " + text);
//continue;
}
if (String(text).begins_with("[_TT_") && last_t == 0) {
if (j > 0) {
auto last_token = whisper_full_get_token_data(speech_to_text_obj->context_instance, i, j - 1);
last_t = last_token.t1;
}
}
msg.text += text;
}
}
Expand All @@ -545,18 +552,33 @@ void SpeechToText::run() {
* Clear audio buffer when the size exceeds iteration threshold or
* speech end is detected.
*/
if (speech_has_end) {
msg.text += "[_END_]";
}

if (pcmf32.size() > n_samples_iter_threshold || speech_has_end) {
const auto t_now = Time::get_singleton()->get_ticks_msec();
const auto t_diff = t_now - speech_to_text_obj->t_last_iter;
speech_to_text_obj->t_last_iter = t_now;

msg.is_partial = false;
/**
* Keep the last few samples in the audio buffer, so the next
* iteration has a smoother start.
*/
std::vector<float> last(pcmf32.end() - n_samples_keep_iter, pcmf32.end());
pcmf32 = std::move(last);
if (last_t == 0 || speech_has_end) {
std::vector<float> last(pcmf32.end() - n_samples_keep_iter, pcmf32.end());
pcmf32 = std::move(last);
} else {
int target_index = int(last_t / 100.0 * WHISPER_SAMPLE_RATE);
if (target_index >= pcmf32.size()) {
std::vector<float> last(pcmf32.end() - n_samples_keep_iter, pcmf32.end());
pcmf32 = std::move(last);
} else {
std::vector<float> last(pcmf32.begin() + target_index, pcmf32.end());
pcmf32 = std::move(last);
}
}

} else {
msg.is_partial = true;
}
Expand Down