% Lab 3 for Digital Audio Signal Processing Lab Sessions
% Session 3: Noise reduction in the STFT domain
% R. Ali, G. Bernardi, J. Schott, A. Bertrand
% 2020
%
% The following is the skeleton code for performing noise reduction in the
% STFT domain. Complete the missing sections accordingly.
clear;
close all;
load Computed_RIRs
load HRTF
%% Exercise 3.1: ## Obtain the noisy microphone signals as outlined in the session document.
%
% SOME CONVENTIONS:
%
% Your clean speech signal should be called "speech" and is a binaural signal such that:
% speech = [speech_L speech_R] % speech_L and speech_R are the clean binaurally synthesised speech signals from session 1
%
% Your noises should be stored in one binaural variable called "noise"
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Section of code to complete %
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% hyperparameters
speech_filename='audio_files/speech1.wav';
% noise_filename='audio_files/Babble_noise1.wav';
noise_filename='audio_files/White_noise1.wav';
fs_resample=fs_RIR;
RIR_length=200;
speech_length=5; % seconds
noise_length=speech_length; % seconds
SNR_correlated_noise=5; % in dB
SNR_uncorrelated_noise=30; % in dB
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% generate speech signal
[speech_raw,fs_speech]=audioread(speech_filename);
speech_raw=speech_raw(1:fs_speech*speech_length); % truncate
speech_raw=resample(speech_raw,fs_resample,fs_speech);
speech_L=fftfilt(squeeze(RIR_sources(1:RIR_length,1,:)),speech_raw); % RIRs from all sources to mic 1 (left)
speech_L=mean(speech_L,2); % average the source contributions
speech_R=fftfilt(squeeze(RIR_sources(1:RIR_length,2,:)),speech_raw); % RIRs from all sources to mic 2 (right)
speech_R=mean(speech_R,2);
speech=[speech_L speech_R];
speech=fftfilt(HRTF,speech); % binaural synthesis with the HRTFs
% generate noise signal
[noise_raw,fs_noise]=audioread(noise_filename);
noise_raw=noise_raw(1:fs_noise*noise_length);
noise_raw=resample(noise_raw,fs_resample,fs_noise);
% generate correlated noise signal
% the correlation is generated by filtering RIR_noise
noise_correlated=fftfilt(RIR_noise(1:RIR_length,:),noise_raw);
noise_correlated=fftfilt(HRTF,noise_correlated);
noise_correlated_std=std(speech(:,1))*db2mag(-SNR_correlated_noise); % scale so that the SNR w.r.t. the left speech signal equals SNR_correlated_noise
noise_correlated=noise_correlated_std./std(noise_correlated).*noise_correlated;
% generate uncorrelated noise signal
noise_uncorrelated_std=std(speech(:,1))*db2mag(-SNR_uncorrelated_noise); % scale so that the SNR equals SNR_uncorrelated_noise
noise_uncorrelated=noise_uncorrelated_std./std(noise_raw).*noise_raw;
% Create noisy mic signals in the time domain:
noise=noise_correlated+noise_uncorrelated;
y_TD = speech + noise;
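% Optional sanity check (not part of the skeleton): verify the realised
% broadband SNRs against the targets set above, e.g.
% snr(speech(:,1),noise_correlated(:,1))  % should be close to SNR_correlated_noise
% snr(speech(:,1),noise_uncorrelated)     % should be close to SNR_uncorrelated_noise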
% soundsc(y_TD,fs_RIR);pause;
%% Apply the WOLA analysis to observe the signals in the STFT domain, and compute the SPP.
nfft = 512;
window = sqrt(hann(nfft,'periodic'));
noverlap = 2; % overlap factor: the hop size is nfft/noverlap (50% overlap)
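% Note: square-root Hann windows in both the analysis and synthesis stages
% with 50% overlap satisfy the COLA condition, so the WOLA analysis/synthesis
% pair allows (near-)perfect reconstruction.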
time = 0:1/fs_RIR:((length(speech)-1)/fs_RIR);
% ## Apply the WOLA analysis to the noisy mic. signals, the speech, and the noise.
[y_STFT,f] = WOLA_analysis(y_TD ,fs_RIR,window,nfft,noverlap); % To complete
[n_STFT,~] = WOLA_analysis(noise ,fs_RIR,window,nfft,noverlap); % To complete
[x_STFT,~] = WOLA_analysis(speech,fs_RIR,window,nfft,noverlap); % To complete
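% WOLA_analysis.m is provided separately with the lab; for reference, a
% minimal sketch of the assumed interface (hop size = nfft/noverlap,
% positive-frequency bins only) could look like:
%
% function [X_STFT,f] = WOLA_analysis(x,fs,window,nfft,noverlap)
%     hop = nfft/noverlap;
%     N_frames = floor((size(x,1)-nfft)/hop)+1;
%     N_freqs = nfft/2+1;
%     X_STFT = zeros(N_freqs,N_frames,size(x,2));
%     for m = 1:size(x,2)                         % loop over channels
%         for l = 1:N_frames                      % loop over time frames
%             idx = (l-1)*hop+(1:nfft);
%             frame = fft(x(idx,m).*window,nfft); % windowed FFT
%             X_STFT(:,l,m) = frame(1:N_freqs);   % keep DC up to Nyquist
%         end
%     end
%     f = (0:N_freqs-1)'*fs/nfft;                 % frequency axis in Hz
% end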
% Observe the STFT
clow = -60; chigh = 10; % lower and upper limits for signal power in spectrogram (can change for different resolutions)
[N_freqs, N_frames] = size(y_STFT(:,:,1));
% ## Compute the Speech Presence Probability on the reference microphone
% (you can try the speech-only signal or the noisy-speech in one of the microphones)
% Use the attached spp_calc.m function
[noisePowMat, SPP] = spp_calc(speech(:,1),nfft,nfft/noverlap); % To complete
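% spp_calc returns an estimate of the noise power per time-frequency bin
% (noisePowMat, unused here) and the speech presence probability SPP
% (N_freqs x N_frames, values in [0,1]).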
figure;
subplot(2,1,1);
imagesc(time, f/1000, mag2db(abs(x_STFT(:,:,1))), [clow, chigh]); colorbar;
axis xy; set(gca,'fontsize', 14);
set(gcf,'color','w'); xlabel('Time (s)'); ylabel('Frequency (kHz)')
title('Spectrogram of the clean speech signal (ref mic)'); hold on;
% Observe the SPP
subplot(2,1,2); imagesc(1:N_frames,f,SPP); colorbar; axis xy; set(gcf,'color','w');
set(gca,'Fontsize',14), xlabel('Time Frames'), ylabel('Frequency (Hz)'), title('Speech Presence Probability for ref mic (evaluated on the clean speech signal)');
% [noisePowMat, SPP] = spp_calc(y_TD(:,1),nfft,nfft/noverlap);% To complete
% figure;
% subplot(2,1,1);
% imagesc(time, f/1000, mag2db(abs(y_STFT(:,:,1))), [clow, chigh]); colorbar;
% axis xy; set(gca,'fontsize', 14);
% set(gcf,'color','w'); xlabel('Time (s)'); ylabel('Frequency (kHz)')
% title('Observed Spectrogram of Noisy Mic'); hold on;
% % Observe the SPP
% subplot(2,1,2); imagesc(1:N_frames,f,SPP); colorbar; axis xy; set(gcf,'color','w');
% set(gca,'Fontsize',14), xlabel('Time Frames'), ylabel('Frequency (Hz)'), title('Speech Presence Probability for ref mic (evaluated on the noisy output signal)');
%% Exercise 3.2: ## Implementation of the MWF
num_mics = 2;
Rnn = cell(N_freqs,1); Rnn(:) = {1e-6*ones(num_mics,num_mics)}; % Noise Only (NO) corr. matrix, initialised with small values
Ryy = cell(N_freqs,1); Ryy(:) = {1e-6*ones(num_mics,num_mics)}; % Speech + Noise (SPN) corr. matrix, initialised with small values
lambda = 0.995; % Forgetting factors for correlation matrices - can change
SPP_thr = 0.9; % Threshold for SPP - can change
% MWF filters and filtered STFTs (the third dimension indexes the reference mic, i.e. left/right ear)
S_mvdr_mwfL_stft = zeros(N_freqs,N_frames,num_mics);
X_mvdr_mwfL_stft = zeros(N_freqs,N_frames,num_mics);
N_mvdr_mwfL_stft = zeros(N_freqs,N_frames,num_mics);
W_mvdr_mwfL = (1/num_mics)*ones(num_mics,N_freqs,num_mics);
skip_threshold = 2; % skip the filter update if it contains excessively large coefficients
% STFT Processing
% Looping through each time frame and each frequency bin
tic
for l=2:N_frames % Time index
for k = 1:N_freqs % Freq index
% Create a vector of mic. signals
Y_kl = squeeze(y_STFT(k,l,1:num_mics)); % M x 1 noisy mic sigs for this freq and time frame
X_kl = squeeze(x_STFT(k,l,1:num_mics));
N_kl = squeeze(n_STFT(k,l,1:num_mics));
% ## Update the correlation matrices using the forgetting factor.
% Threshold the SPP in order to distinguish between periods of speech and non-speech activity
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Section of code to complete (3 lines) %
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
if(SPP(k,l)>SPP_thr) % speech+noise
Ryy{k}=lambda^2*Ryy{k}+(1-lambda^2).*Y_kl*Y_kl';
else % noise only
Rnn{k} = lambda^2*Rnn{k}+(1-lambda^2).*Y_kl*Y_kl';
end
% Computing the MWF filter using a generalised eigenvalue
% decomposition of the correlation matrices.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Section of code to complete ~ 10 lines %
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
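% The MWF below is factored into a spatial filter and a single-channel
% Wiener post-filter: W_spectral = Qh(1,1)*(Rnn^-1*h)/(h'*Rnn^-1*h) with
% h = Q(:,m) from the GEVD, and W_scalar = rho_ss/(rho_ss + 1/(h'*Rnn^-1*h)),
% where rho_ss is the speech power estimate sig_yy(1,1)-sig_nn(1,1) obtained
% from the transformed correlation matrices.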
[V,d] = eig(Ryy{k},Rnn{k},'vector');
[d,ind] = sort(d,'descend','ComparisonMethod','real');
V = V(:,ind);
Qh = pinv(V); Q = Qh';
sig_yy = diag(diag(V'*Ryy{k}*V)); % keep only the (near-)diagonal part
sig_nn = diag(diag(V'*Rnn{k}*V));
sig_ss=sig_yy(1,1)-sig_nn(1,1);
for m=1:num_mics
rho_ss=sig_ss;
h=Q(:,m);
W_scalar = rho_ss/(rho_ss+1/(h'*pinv(Rnn{k})*h));
W_spectral=Qh(1,1)*(1/(h'*pinv(Rnn{k})*h))*pinv(Rnn{k})*h;
W_update=W_scalar*W_spectral;
if(max(abs(W_update))<skip_threshold) % only accept bounded filter updates
W_mvdr_mwfL(:,k,m)=W_update;
end
% Filtering the noisy speech, the speech-only, and the noise-only.
S_mvdr_mwfL_stft(k,l,m) = W_mvdr_mwfL(:,k,m)'*Y_kl;
X_mvdr_mwfL_stft(k,l,m) = W_mvdr_mwfL(:,k,m)'*X_kl;
N_mvdr_mwfL_stft(k,l,m) = W_mvdr_mwfL(:,k,m)'*N_kl;
end % end mics
end % end freqs
end % end time frames
toc
%% Observe processed STFTs
figure;
subplot(4,1,1);
spectrogram_plot(y_STFT(:,:,1), clow, chigh, time, f);
title("Multi Channel: left ear: noisy spectrogram");
subplot(4,1,2);
spectrogram_plot(S_mvdr_mwfL_stft(:,:,1), clow, chigh, time, f);
title("Multi Channel: left ear: filtered spectrogram");
subplot(4,1,3);
spectrogram_plot(y_STFT(:,:,2), clow, chigh, time, f);
title("Multi Channel: right ear: noisy spectrogram");
subplot(4,1,4);
spectrogram_plot(S_mvdr_mwfL_stft(:,:,2), clow, chigh, time, f);
title("Multi Channel: right ear: filtered spectrogram");
%% Apply the synthesis stage of the WOLA framework to obtain the time domain equivalents:
s_mwfL = WOLA_synthesis(S_mvdr_mwfL_stft,window,nfft,noverlap); % To complete (time-domain version of S_mvdr_mwfL_stft)
x_mwfL = WOLA_synthesis(X_mvdr_mwfL_stft,window,nfft,noverlap); % To complete (time-domain version of X_mvdr_mwfL_stft)
n_mwfL = WOLA_synthesis(N_mvdr_mwfL_stft,window,nfft,noverlap); % To complete (time-domain version of N_mvdr_mwfL_stft)
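% WOLA_synthesis.m is provided separately with the lab; for reference, a
% minimal sketch of the assumed interface (conjugate-symmetric extension of
% the positive-frequency bins, then windowed overlap-add) could look like:
%
% function x = WOLA_synthesis(X_STFT,window,nfft,noverlap)
%     hop = nfft/noverlap;
%     [N_freqs,N_frames,M] = size(X_STFT);
%     x = zeros((N_frames-1)*hop+nfft,M);
%     for m = 1:M                                    % loop over channels
%         for l = 1:N_frames                         % loop over time frames
%             spec = [X_STFT(:,l,m); conj(X_STFT(end-1:-1:2,l,m))];
%             frame = real(ifft(spec,nfft)).*window; % synthesis window
%             idx = (l-1)*hop+(1:nfft);
%             x(idx,m) = x(idx,m)+frame;             % overlap-add
%         end
%     end
% end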
%% EVALUATION
SNR_in  = snr(speech(:,1),noise(:,1));  % input SNR: clean speech vs. noise at the ref mic
SNR_out = snr(x_mwfL(:,1),n_mwfL(:,1)); % output SNR: filtered speech vs. filtered noise
fprintf("Multi Channel MWF: input SNR is %.2f dB and output SNR is %.2f dB.\n",SNR_in,SNR_out);
% note: try changing the noise source / mic placement and see how the result changes
figure;
plot(y_TD(:,1),'DisplayName','Original noisy signal'); hold on;
plot(s_mwfL(:,1),'DisplayName','Filtered noisy signal');
plot(x_mwfL(:,1),'DisplayName','Filtered clean signal');
plot(speech(:,1),'DisplayName','Original clean signal');
legend('Location','southwest');
%% the following code is stored in spectrogram_plot.m
% function spectrogram_plot(input, clow, chigh, time, f)
% imagesc(time,f/1000,mag2db(abs(input)), [clow, chigh]);
% colorbar; axis xy; set(gcf,'color','w');
% set(gca,'Fontsize',14);
% xlabel('Time (s)');
% ylabel('Frequency (kHz)');
% end
%% LISTEN
% fprintf("Original binaural clean signal, ");soundsc(speech,fs_RIR);pause;
% fprintf("filtered binaural clean signal, ");soundsc(x_mwfL,fs_RIR);pause;
%% debug result
% figure;plot(x_mwfL(:,1)-x_mwfL(:,2));hold on;plot(speech(:,1)-speech(:,2)); % binaural debug