-
Notifications
You must be signed in to change notification settings - Fork 3.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
流式输出的音频有噪音 #521
Comments
我也发现流式输出有时候有问题,流式输出是一个ndarray list,正常情况下会ndarray输出是12032维,但是会随机在中间偶发输出256维,开始输出256维后就会一直保持,256维的部分都是噪声 |
@dingxianda001-ke 可以发一下你出问题的文本,我复现一下。 |
- optimize(all): use torch.inference_mode instead of with torch.no_grad() - fix(core): missing params passing to generate - chore(logging): use warning instead of warn for deprecation - optimize(gpt): stream triggering timing - feat(stream): add test for issue #521
流式确实有噪音,我是固定音色同样句子和参数,一个流式一个非流式,流式效果比非流式差很多。有噪音并且感觉不太流畅(我是把流式结果已经全写到文件里,然后从文件里一把读出来给 pyaudio 的,不可能出现数据断流问题)。 |
我试了很多,都存在这样的问题,你可以多输入些文本。 |
+1 在句子的开头和结尾,以及断句处最容易出现, |
最新测试表明也许是gradio拼接问题,我在cmd手动拼接后,发现没有噪声/不连续。 |
尝试以下代码,可稳定复现噪声/不连续。
import io
import threading
import time
import random
import pyaudio # please install it manually
import numpy as np
import ChatTTS
import torch
from tools.audio import batch_unsafe_float_to_int16
import numpy as np
from scipy.signal import butter, lfilter,wiener
def lowpass_filter(data, cutoff_freq, fs, order=5):
    """Apply a causal Butterworth low-pass filter to ``data``.

    Args:
        data: Array-like signal; it is filtered along its last axis.
        cutoff_freq: Cut-off frequency in Hz. Must lie strictly between 0
            and the Nyquist frequency ``fs / 2``.
        fs: Sampling rate of ``data`` in Hz.
        order: Order of the Butterworth design.

    Returns:
        numpy.ndarray: The filtered signal, with the same shape as ``data``.

    Raises:
        ValueError: Propagated from ``scipy.signal.butter`` when the
            normalised cut-off is not inside the open interval (0, 1).
    """
    # Local import: keeps the block self-contained without touching the
    # file-level import list.
    from scipy.signal import butter, sosfilt

    nyquist = 0.5 * fs
    normal_cutoff = cutoff_freq / nyquist
    # Second-order sections avoid the coefficient round-off problems of the
    # (b, a) polynomial form, which SciPy warns about even at modest orders.
    sos = butter(order, normal_cutoff, btype="low", analog=False, output="sos")
    return sosfilt(sos, data)
def bandpass_filter(data, lowcut, highcut, fs, order=5):
    """Apply a causal Butterworth band-pass filter to ``data``.

    Args:
        data: Array-like signal; it is filtered along its last axis.
        lowcut: Lower band edge in Hz.
        highcut: Upper band edge in Hz. Both edges must lie strictly between
            0 and the Nyquist frequency ``fs / 2``.
        fs: Sampling rate of ``data`` in Hz.
        order: Order of the Butterworth prototype (the resulting band-pass
            filter has twice this order).

    Returns:
        numpy.ndarray: The filtered signal, with the same shape as ``data``.

    Raises:
        ValueError: Propagated from ``scipy.signal.butter`` when a normalised
            band edge is not inside the open interval (0, 1).
    """
    # Local import: keeps the block self-contained without touching the
    # file-level import list.
    from scipy.signal import butter, sosfilt

    nyquist = 0.5 * fs
    low = lowcut / nyquist
    high = highcut / nyquist
    # A band-pass design doubles the polynomial order; with a band edge close
    # to DC the (b, a) form loses precision, so filter with cascaded
    # second-order sections instead.
    sos = butter(order, [low, high], btype="band", output="sos")
    return sosfilt(sos, data)
# RMS helper used to inspect audio chunks.
def calculate_rms(data):
    """Return the root-mean-square of ``data``.

    NaN and +/-inf entries are replaced by 0 before the computation.

    Args:
        data: Array-like of samples (for example int16 PCM or float audio).

    Returns:
        The RMS value as a float, or ``numpy.nan`` when ``data`` is empty.
    """
    samples = np.nan_to_num(data, nan=0.0, posinf=0.0, neginf=0.0)
    if samples.size == 0:
        return np.nan
    if not np.iscomplexobj(samples):
        # Squaring in the input dtype wraps around for int16 PCM
        # (30000 ** 2 does not fit), which can make the mean negative and the
        # square root NaN. Compute in float64 instead.
        samples = samples.astype(np.float64)
    return np.sqrt(np.mean(np.square(samples)))
def remove_bad_rms(data, n_blocks=2):
    """Sanitise ``data`` and drop every block whose RMS is NaN.

    NaN and +/-inf entries are first replaced by 0. The result is split into
    ``n_blocks`` consecutive blocks along the first axis (the last block
    takes the remainder), and blocks with a NaN RMS are discarded.

    NOTE: because the input is sanitised first and the RMS is computed in
    float64, finite numeric input is not expected to lose any block; the
    RMS check is kept as a safeguard.

    Args:
        data: Array-like of samples.
        n_blocks: Number of blocks to split into. Values larger than
            ``len(data)`` are reduced to ``len(data)``.

    Returns:
        numpy.ndarray: Concatenation of the kept blocks, or an empty array
        when the input is empty or no block is kept.

    Raises:
        ValueError: If ``n_blocks`` is smaller than 1 for non-empty input.
    """
    cleaned = np.nan_to_num(data, nan=0.0, posinf=0.0, neginf=0.0)
    total = len(cleaned)
    if total == 0:
        # Nothing to split; also avoids dividing by a zero block count.
        return np.array([])
    if n_blocks < 1:
        raise ValueError("n_blocks must be at least 1")

    n_blocks = min(n_blocks, total)
    block_size = total // n_blocks

    valid_blocks = []
    for i in range(n_blocks):
        start_index = i * block_size
        # The last block absorbs whatever is left over.
        stop_index = total if i == n_blocks - 1 else start_index + block_size
        # Always slice from the full sanitised array; rebinding it to the
        # current block would make every later slice come out wrong.
        block = cleaned[start_index:stop_index]
        # float64 avoids integer wrap-around when squaring int16 PCM.
        rms = np.sqrt(np.mean(np.square(block.astype(np.float64))))
        if not np.isnan(rms):
            valid_blocks.append(block)

    if valid_blocks:
        return np.concatenate(valid_blocks)
    return np.array([])
import lzma
import numpy as np
import pybase16384 as b14
def compress_and_encode(tensor):
    """Serialise a tensor as float16, LZMA-compress it and encode it as text.

    Args:
        tensor: A ``torch.Tensor`` (e.g. a speaker embedding).

    Returns:
        The string produced by ``pybase16384.encode_to_string`` for the
        raw-LZMA2-compressed float16 bytes of ``tensor``.
    """
    # ``Tensor.numpy()`` refuses tensors that live on an accelerator or that
    # require grad, so detach and move to the CPU first.
    np_array = tensor.detach().cpu().numpy().astype(np.float16)
    compressed = lzma.compress(
        np_array.tobytes(),
        format=lzma.FORMAT_RAW,
        filters=[{"id": lzma.FILTER_LZMA2, "preset": 9 | lzma.PRESET_EXTREME}],
    )
    return b14.encode_to_string(compressed)
# In-memory PCM buffer shared between a producer and a consumer thread.
class AudioStreamer:
    """Append-only byte buffer with an independent read cursor.

    ``write`` appends 16-bit little-endian samples at the end of the buffer;
    ``read`` returns everything appended since the previous ``read``. Both
    operations hold ``lock`` so they can be called from different threads.
    """

    def __init__(self):
        self.bio = io.BytesIO()
        self.lock = threading.Lock()
        # Byte offset of the first byte that has not been handed out yet.
        self.seek_index = 0

    # Streaming write
    def write(self, waveform):
        """Append ``waveform`` to the buffer as little-endian int16 bytes.

        Args:
            waveform: Array-like of samples. Values are cast with
                ``astype("<i2")``, so they are expected to be in the int16
                range already.
        """
        # Convert outside the lock so the reader is blocked as briefly as
        # possible.
        write_binary = np.asarray(waveform).astype("<i2").tobytes()
        with self.lock:
            # Always append: do not depend on where an earlier read or an
            # external seek left the cursor.
            self.bio.seek(0, io.SEEK_END)
            self.bio.write(write_binary)

    # Streaming read
    def read(self):
        """Return all bytes written since the previous call (may be empty)."""
        with self.lock:
            self.bio.seek(self.seek_index)
            read_binary = self.bio.read()
            self.seek_index += len(read_binary)
            return read_binary
# Streaming driver: buffers ChatTTS output and plays it through PyAudio.
class ChatStreamer:
    """Collect chunks from a ChatTTS streaming generator and play them back.

    ``write`` consumes the generator, groups small chunks into larger blocks
    and pushes int16 PCM into ``self.streamer``. ``play`` pulls from
    ``self.streamer`` and writes to a PyAudio output stream. Both are meant
    to run in their own thread (see ``start_writing``, ``start_playing`` and
    ``run``).
    """

    # Chunks with this many samples or fewer are not played. The stream was
    # observed to occasionally emit 256-sample chunks that contain only
    # noise, whereas regular chunks are far longer.
    NOISE_CHUNK_MAX_SAMPLES = 257
    # Upper bound, in seconds, on the sleep between two polls of the buffer
    # while the player waits for data.
    MAX_POLL_INTERVAL = 0.1

    def __init__(self, waittime_topause=50, base_block_size=8000):
        """Create a streamer.

        Args:
            waittime_topause: Seconds without new data after which ``play``
                stops on its own.
            base_block_size: Minimum number of buffered samples, per text
                that is still producing audio, before a block is written.
        """
        self.streamer = AudioStreamer()
        self.accum_streamwavs = []
        self.waittime_topause = waittime_topause
        self.base_block_size = base_block_size
        # Defined up front so ``play`` can read it before any ``write``.
        self.finish = False
        self.writer = None
        self.player = None

    @staticmethod
    def _accumulate(accum_wavs, stream_wav):
        """Append the non-``None`` entries of ``stream_wav`` per text."""
        if accum_wavs is None:
            # Start with empty per-text lists so no ``None`` placeholder can
            # later reach ``np.concatenate``.
            accum_wavs = [[] for _ in stream_wav]
        for per_text, wav in zip(accum_wavs, stream_wav):
            if wav is not None:
                per_text.append(wav)
        return accum_wavs

    @staticmethod
    def _merge_pending(history_stream_wav, new_stream_wav, thre):
        """Merge buffered audio with a new chunk and decide whether to wait.

        Returns:
            ``(merged, is_keep_next)`` where ``is_keep_next`` is True while
            the merged sample count (along axis 1) is still below ``thre``.
        """
        debug_roll = -1
        if history_stream_wav is None:
            result_stream = new_stream_wav
        else:
            # Diagnostic output is emitted for roughly 90% of the merges.
            debug_roll = random.random()
            if debug_roll > 0.1:
                print("update_stream")
            result_stream = []
            for old, new in zip(history_stream_wav, new_stream_wav):
                if new is None:
                    result_stream.append(old)
                elif old is None:
                    # This text had nothing buffered yet.
                    result_stream.append(new)
                else:
                    result_stream.append(np.concatenate([old, new], axis=1))

        is_keep_next = (
            sum(wav.shape[1] for wav in result_stream if wav is not None) < thre
        )
        if debug_roll > 0.1:
            print(
                "result_stream:",
                is_keep_next,
                [i.shape if i is not None else None for i in result_stream],
            )
        return result_stream, is_keep_next

    def _write_samples(self, samples):
        """Print the chunk length and play it unless it is a noise-sized chunk."""
        print(samples.shape[0])
        if samples.shape[0] > self.NOISE_CHUNK_MAX_SAMPLES:
            self.streamer.write(samples)

    def _write_accumulated(self, chunks):
        """Concatenate the stored chunks of one text and play the result."""
        if not chunks:
            return
        self._write_samples(np.concatenate(chunks, axis=1)[0])

    def write(self, chatstream):
        """Consume a streaming generator and feed the audio buffer.

        Args:
            chatstream: Iterable yielding, per step, a sequence with one
                entry per input text. An entry is either ``None`` or an
                array whose samples run along axis 1 and whose row 0 is the
                waveform (presumably shape ``(1, n_samples)`` - confirm
                against the ChatTTS version in use).

        Texts are played one after another: audio for later texts is stored
        until the current text has finished. ``self.finish`` is set to True
        when the method returns, including when the generator raises.
        """
        self.finish = False
        curr_sentence_index = 0
        n_texts = 0
        history_stream_wav = None
        article_streamwavs = None
        try:
            for stream_wav in chatstream:
                n_texts = len(stream_wav)
                n_valid_texts = sum(1 for wav in stream_wav if wav is not None)
                if n_valid_texts == 0:
                    continue

                block_thre = n_valid_texts * self.base_block_size
                stream_wav, is_keep_next = self._merge_pending(
                    history_stream_wav, stream_wav, block_thre
                )
                # Not enough data yet: keep it and wait for the next chunk.
                if is_keep_next:
                    history_stream_wav = stream_wav
                    continue

                # Enough data: convert and write.
                history_stream_wav = None
                stream_wav = batch_unsafe_float_to_int16(stream_wav)
                article_streamwavs = self._accumulate(article_streamwavs, stream_wav)

                if stream_wav[curr_sentence_index] is not None:
                    # The current text is still producing audio.
                    self._write_samples(stream_wav[curr_sentence_index][0])
                elif curr_sentence_index < n_texts - 1:
                    # The current text is done: switch to the next one and
                    # play everything already generated for it.
                    curr_sentence_index += 1
                    print("add next sentence")
                    self._write_accumulated(article_streamwavs[curr_sentence_index])
                else:
                    # Nothing left to play in order; leftovers are handled
                    # after the loop.
                    break
                # Writing too quickly seemed to make the 256-sample noise
                # chunks more likely, so throttle slightly.
                time.sleep(0.02)

            # Flush audio that was still being buffered when the stream
            # ended. Use the buffered history itself, not the last yielded
            # chunk, which may have been all None.
            if history_stream_wav is not None:
                tail = batch_unsafe_float_to_int16(history_stream_wav)
                article_streamwavs = self._accumulate(article_streamwavs, tail)
                if tail[curr_sentence_index] is not None:
                    self._write_samples(tail[curr_sentence_index][0])

            if article_streamwavs is not None:
                # Play the texts that finished generating but were never
                # reached during the loop.
                for i_text in range(curr_sentence_index + 1, n_texts):
                    self._write_accumulated(article_streamwavs[i_text])
                self.accum_streamwavs.append(article_streamwavs)
        finally:
            # Always release a player that is waiting on ``auto_end``.
            self.finish = True

    def play(self, waittime_tostart=5, auto_end=False):
        """Play buffered audio on the default output device.

        Args:
            waittime_tostart: Seconds to wait before playback starts.
            auto_end: When True, stop as soon as the buffer is empty and
                ``write`` has finished. When False, stop only after
                ``waittime_topause`` seconds without new data.
        """
        p = pyaudio.PyAudio()
        try:
            # 16-bit mono PCM at 24 kHz.
            stream_out = p.open(
                format=pyaudio.paInt16, channels=1, rate=24000, output=True
            )
            try:
                print("开始流式音频播放...")
                time.sleep(waittime_tostart)

                # Poll at a short interval so playback resumes quickly after
                # an underrun.
                poll_interval = min(self.waittime_topause / 10, self.MAX_POLL_INTERVAL)
                wait_time = 0
                while True:
                    # Sample the flag BEFORE reading: if the writer had
                    # already finished, the read below sees all of its data.
                    writer_done = self.finish
                    read_data = self.streamer.read()
                    if read_data:
                        stream_out.write(read_data)
                        wait_time = 0
                        continue
                    if wait_time >= self.waittime_topause:
                        break
                    if auto_end and writer_done:
                        print("写操作完成,自动结束。")
                        break
                    time.sleep(poll_interval)
                    wait_time += poll_interval

                print("完成流式音频播放...")
            finally:
                stream_out.stop_stream()
                stream_out.close()
        finally:
            # Release the PortAudio resources held by this PyAudio instance.
            p.terminate()

    # Full history of the audio produced so far
    def get_complete_speech(self):
        """Return every stored chunk concatenated along axis 1.

        Chunks are ordered by ``write`` call, then by text, then by arrival.
        Returns an empty ``(1, 0)`` int16 array when nothing is stored.
        """
        chunks = []
        for article in self.accum_streamwavs:
            if article is None:
                continue
            for per_text in article:
                chunks.extend(wav for wav in per_text if wav is not None)
        if not chunks:
            return np.zeros((1, 0), dtype=np.int16)
        return np.concatenate(chunks, axis=1)

    # Start writing audio; may be called several times
    def start_writing(self, streamchat):
        """Run ``write(streamchat)`` in a background thread."""
        # Reset before the thread starts so a player cannot observe the
        # stale True left by a previous write.
        self.finish = False
        self.writer = threading.Thread(target=self.write, args=(streamchat,))
        self.writer.start()

    # Start playback
    def start_playing(self, waittime_tostart=5):
        """Run ``play(waittime_tostart)`` in a background thread."""
        self.player = threading.Thread(target=self.play, args=(waittime_tostart,))
        self.player.start()

    # Wait for writer and player; call self.writer.join() or
    # self.player.join() directly for finer control
    def join(self):
        """Wait for whichever of the writer and player threads were started."""
        for worker in (self.writer, self.player):
            if worker is not None:
                worker.join()

    # One complete write + playback cycle
    def run(self, streamchat, waittime_tostart=5):
        """Write and play ``streamchat`` concurrently; return when both end."""
        self.finish = False
        self.writer = threading.Thread(target=self.write, args=(streamchat,))
        self.player = threading.Thread(target=self.play, args=(waittime_tostart, True))
        self.writer.start()
        self.player.start()
        self.writer.join()
        self.player.join()
if __name__ == "__main__":
    # Load ChatTTS from a local model directory (machine-specific path).
    chat = ChatTTS.Chat()
    chat.load(compile=False,source="custom", custom_path="F:\\chatts\\modle\\ChatTTS")
    print("-------------------------------------------------------------------")

    # Sample a speaker and keep it so the run can be reproduced.
    rand_spk = chat.sample_random_speaker()
    print(rand_spk)
    torch.save(rand_spk, './save.pth')

    # Previously saved speakers that can be loaded instead:
    # male, good
    # rand_spk = torch.load("./pt/seed_2085_restored_emb.pt", map_location=torch.device('cpu')).detach()
    # female, good ---
    # rand_spk = torch.load("./pt/2/seed_2_restored_emb.pt", map_location=torch.device('cpu')).detach()
    # male, good, robotic
    # rand_spk = torch.load("./pt/2/seed_88_restored_emb.pt", map_location=torch.device('cpu')).detach()
    # female, good ---
    # rand_spk = torch.load("./pt/2/seed_709_restored_emb.pt", map_location=torch.device('cpu')).detach()
    # female, good
    # rand_spk = torch.load("./pt/2/seed_742_restored_emb.pt", map_location=torch.device('cpu')).detach()
    # male, good
    # rand_spk = torch.load("./pt/2/seed_1504_restored_emb.pt", map_location=torch.device('cpu')).detach()
    # rand_spk = torch.load('./girl2.pth')
    print("-------------------------------------------------------------------")
    # rand_spk = compress_and_encode(rand_spk)

    params_infer_code = ChatTTS.Chat.InferCodeParams(
        spk_emb=rand_spk,  # add sampled speaker
        temperature=0.0001,  # using custom temperature
        # top_P=0.7,  # top P decode
        # top_K=20,  # top K decode
        prompt="[speed_0]"
    )
    params_refine_text = ChatTTS.Chat.RefineTextParams(
        prompt='[oral_2][laugh_0][break_6]',
    )

    # Get the ChatTTS streaming-inference generator.
    streamchat = chat.infer(
        "EvilBOT : 在黑暗的角落里,杀人可能被视为一种艺术,但我要提醒你,这违背了我作为邪恶机器人的本性,因为我是无所不能的。",
        skip_refine_text=True,
        params_infer_code=params_infer_code,
        stream=True,
        params_refine_text=params_refine_text,
    )

    streamer = ChatStreamer()
    # Generate the first text completely before playback starts.
    streamer.write(streamchat)
    # Play in a background thread while the remaining texts are generated.
    streamer.start_playing(waittime_tostart=5)
    streamer.write(chat.infer(
        "有一个神奇的故事,传说在很远很远的未来。",
        skip_refine_text=True,
        params_infer_code=params_infer_code,
        stream=True,
    ))
    streamer.write(chat.infer(
        "有一种叫做奥特曼的物种。他是超人族的一员。",
        skip_refine_text=True,
        params_infer_code=params_infer_code,
        stream=True,
    ))
    # Without auto_end the player stops after waittime_topause seconds
    # without new data.
    streamer.player.join()
试试最新的 |
@jackbapa 应该是你计算RMS时的问题,int16在平方后有可能溢出变为负数。 |
流式播放用 RAW(PCM), 或在转码后去掉 header. 噪音原因: WAV,PCM 流: 换用 PCM: |
目前已经强制gradio用wav格式解决了,但是gradio还有个问题是流式结束时必报错
还没找到什么好办法。 |
我试着将每一段转成语音并保存下来,并通过波形图进行展示,发现某段会产生噪音,这种噪音比较小,但是连起来,会感觉像是有些卡顿。 |
试试最新dev版本,我已针对此问题进行了处理。 |
这个很像之前遇到的一个问题,也是重复的header,但是那一次音频文件显示为损坏不可读~ 是否可以优化example中的stream.py? 或者给一些优化的示例代码,十分感谢~ |
按照这个思路, 我测试了一下,
|
@Ox0400 试试最新的,我同步到main了, |
@fumiama 我试下, 确实去掉噪音了. 但发现了一个新问题. 最后一块获取到的是全量的音频文件大小, 虽然这块音频听到的内容不包含全部的音频内容, 理论上期望最后一块大小应该是
|
会修一下
主要是因为引入了批量vocos解码,在定义多个texts时,只能以最长的那个为基准。 |
😂我也在改,但是改了一半,我现在推到dev了,你可以在我的基础上把delete加上。 |
如果只是要正常播放,简单跳过全0窗口就好了(每0.25秒窗口检查一次,这样最后的空白部分就不会大于0.25秒,多句子推理音频合并后听起来仍然还是比较流畅的)。不然应该需要改stream的输出格式吧 |
这段代码在流式输出的情况下,和非流式情况下,计算是否一致?我在流式输出的情况下,会产生噪音,怀疑这部分有问题,哪位大佬可以指教一下?
The text was updated successfully, but these errors were encountered: