% Lab 3 for Digital Audio Signal Processing Lab Sessions
% Session 3: Noise reduction in the STFT domain
% R. Ali, G. Bernardi, J. Schott, A. Bertrand
% 2020
%
% The following is the skeleton code for performing noise reduction in the
% STFT domain. Complete the missing sections accordingly.
clear;
close all;
load Computed_RIRs
load HRTF
%% Exercise 3.1: ## Obtain the noisy microphone signals as outlined in the session document.
%
% SOME CONVENTIONS:
%
% Your clean speech signal should be called "speech" and is a binaural signal such that:
% speech = [speech_L speech_R] % speech_L and speech_R are the clean binaurally synthesised speech signals from session 1
%
% Your noises should be stored in one binaural variable called "noise"
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Section of code to complete %
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% hyperparameters
speech_filename='audio_files/speech1.wav';
% noise_filename='audio_files/Babble_noise1.wav';
noise_filename='audio_files/White_noise1.wav';
fs_resample=fs_RIR;
RIR_length=200;
speech_length=5; % seconds
noise_length=speech_length; % seconds
SNR_correlated_noise=5; % in dB
SNR_uncorrelated_noise=30; % in dB
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% generate speech signal
[speech_raw,fs_speech]=audioread(speech_filename);
speech_raw=speech_raw(1:fs_speech*speech_length); % truncate
speech_raw=resample(speech_raw,fs_resample,fs_speech);
speech_L=fftfilt(squeeze(RIR_sources(1:RIR_length,1,:)),speech_raw); % RIRs from all sources to mic 1 (left)
speech_L=mean(speech_L,2); % average the source contributions
speech_R=fftfilt(squeeze(RIR_sources(1:RIR_length,2,:)),speech_raw); % RIRs from all sources to mic 2 (right)
speech_R=mean(speech_R,2);
speech=[speech_L speech_R];
speech=fftfilt(HRTF,speech); % binaural synthesis with the HRTFs
% generate noise signal
[noise_raw,fs_noise]=audioread(noise_filename);
noise_raw=noise_raw(1:fs_noise*noise_length);
noise_raw=resample(noise_raw,fs_resample,fs_noise);
% generate correlated noise signal
% the correlation is generated by filtering RIR_noise
noise_correlated=fftfilt(RIR_noise(1:RIR_length,:),noise_raw);
noise_correlated=fftfilt(HRTF,noise_correlated);
noise_correlated_std=std(speech(:,1))*db2mag(-SNR_correlated_noise); % scale so that the SNR w.r.t. the left speech signal equals SNR_correlated_noise
noise_correlated=noise_correlated_std./std(noise_correlated).*noise_correlated;
% generate uncorrelated noise signal
noise_uncorrelated_std=std(speech(:,1))*db2mag(-SNR_uncorrelated_noise); % scale so that the SNR equals SNR_uncorrelated_noise
noise_uncorrelated=noise_uncorrelated_std./std(noise_raw).*noise_raw;
% Create noisy mic signals in the time domain:
noise=noise_correlated+noise_uncorrelated;
y_TD = speech + noise;
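% Optional sanity check (not part of the skeleton): verify the realised
% broadband SNRs against the targets set above, e.g.
% snr(speech(:,1),noise_correlated(:,1))  % should be close to SNR_correlated_noise
% snr(speech(:,1),noise_uncorrelated)     % should be close to SNR_uncorrelated_noise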
% soundsc(y_TD,fs_RIR);pause;
%% Apply the WOLA analysis to observe the signals in the STFT domain, and compute the SPP.
nfft = 512;
window = sqrt(hann(nfft,'periodic'));
noverlap = 2; % overlap factor: the hop size is nfft/noverlap (50% overlap)
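% Note: square-root Hann windows in both the analysis and synthesis stages
% with 50% overlap satisfy the COLA condition, so the WOLA analysis/synthesis
% pair allows (near-)perfect reconstruction.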
time = 0:1/fs_RIR:((length(speech)-1)/fs_RIR);
% ## Apply the WOLA analysis to the noisy mic. signals, the speech, and the noise.
[y_STFT,f] = WOLA_analysis(y_TD ,fs_RIR,window,nfft,noverlap); % To complete
[n_STFT,~] = WOLA_analysis(noise ,fs_RIR,window,nfft,noverlap); % To complete
[x_STFT,~] = WOLA_analysis(speech,fs_RIR,window,nfft,noverlap); % To complete
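% WOLA_analysis.m is provided separately with the lab; for reference, a
% minimal sketch of the assumed interface (hop size = nfft/noverlap,
% positive-frequency bins only) could look like:
%
% function [X_STFT,f] = WOLA_analysis(x,fs,window,nfft,noverlap)
%     hop = nfft/noverlap;
%     N_frames = floor((size(x,1)-nfft)/hop)+1;
%     N_freqs = nfft/2+1;
%     X_STFT = zeros(N_freqs,N_frames,size(x,2));
%     for m = 1:size(x,2)                         % loop over channels
%         for l = 1:N_frames                      % loop over time frames
%             idx = (l-1)*hop+(1:nfft);
%             frame = fft(x(idx,m).*window,nfft); % windowed FFT
%             X_STFT(:,l,m) = frame(1:N_freqs);   % keep DC up to Nyquist
%         end
%     end
%     f = (0:N_freqs-1)'*fs/nfft;                 % frequency axis in Hz
% end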
% Observe the STFT
clow = -60; chigh = 10; % lower and upper limits for signal power in spectrogram (can change for different resolutions)
[N_freqs, N_frames] = size(y_STFT(:,:,1));
% ## Compute the Speech Presence Probability on the reference microphone
% (you can try the speech-only signal or the noisy-speech in one of the microphones)
% Use the attached spp_calc.m function
[noisePowMat, SPP] = spp_calc(speech(:,1),nfft,nfft/noverlap); % To complete
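% spp_calc returns an estimate of the noise power per time-frequency bin
% (noisePowMat, unused here) and the speech presence probability SPP
% (N_freqs x N_frames, values in [0,1]).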
figure;
subplot(2,1,1);
imagesc(time, f/1000, mag2db(abs(x_STFT(:,:,1))), [clow, chigh]); colorbar;
axis xy; set(gca,'fontsize', 14);
set(gcf,'color','w'); xlabel('Time (s)'); ylabel('Frequency (kHz)')
title('Spectrogram of the clean speech signal (ref mic)'); hold on;
% Observe the SPP
subplot(2,1,2); imagesc(1:N_frames,f,SPP); colorbar; axis xy; set(gcf,'color','w');
set(gca,'Fontsize',14), xlabel('Time Frames'), ylabel('Frequency (Hz)'), title('Speech Presence Probability for ref mic (evaluated on the clean speech signal)');
% [noisePowMat, SPP] = spp_calc(y_TD(:,1),nfft,nfft/noverlap);% To complete
% figure;
% subplot(2,1,1);
% imagesc(time, f/1000, mag2db(abs(y_STFT(:,:,1))), [clow, chigh]); colorbar;
% axis xy; set(gca,'fontsize', 14);
% set(gcf,'color','w'); xlabel('Time (s)'); ylabel('Frequency (kHz)')
% title('Observed Spectrogram of Noisy Mic'); hold on;
% % Observe the SPP
% subplot(2,1,2); imagesc(1:N_frames,f,SPP); colorbar; axis xy; set(gcf,'color','w');
% set(gca,'Fontsize',14), xlabel('Time Frames'), ylabel('Frequency (Hz)'), title('Speech Presence Probability for ref mic (evaluated on the noisy output signal)');
%% Exercise 3.2: ## Implementation of the MWF
num_mics = 2;
Rnn = cell(N_freqs,1); Rnn(:) = {1e-6*ones(num_mics,num_mics)}; % Noise Only (NO) corr. matrix, initialised with small values
Ryy = cell(N_freqs,1); Ryy(:) = {1e-6*ones(num_mics,num_mics)}; % Speech + Noise (SPN) corr. matrix, initialised with small values
lambda = 0.995; % Forgetting factors for correlation matrices - can change
SPP_thr = 0.9; % Threshold for SPP - can change
% MWF filters and filtered STFTs (the third dimension indexes the reference mic, i.e. left/right ear)
S_mvdr_mwfL_stft = zeros(N_freqs,N_frames,num_mics);
X_mvdr_mwfL_stft = zeros(N_freqs,N_frames,num_mics);
N_mvdr_mwfL_stft = zeros(N_freqs,N_frames,num_mics);
W_mvdr_mwfL = (1/num_mics)*ones(num_mics,N_freqs,num_mics);
skip_threshold = 2; % skip the filter update if it contains excessively large coefficients
% STFT Processing
% Looping through each time frame and each frequency bin
tic
for l=2:N_frames % Time index
for k = 1:N_freqs % Freq index
% Create a vector of mic. signals
Y_kl = squeeze(y_STFT(k,l,1:num_mics)); % M x 1 noisy mic sigs for this freq and time frame
X_kl = squeeze(x_STFT(k,l,1:num_mics));
N_kl = squeeze(n_STFT(k,l,1:num_mics));
% ## Update the correlation matrices using the forgetting factor.
% Threshold the SPP in order to distinguish between periods of speech and non-speech activity
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Section of code to complete (3 lines) %
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
if(SPP(k,l)>SPP_thr) % speech+noise
Ryy{k}=lambda^2*Ryy{k}+(1-lambda^2).*Y_kl*Y_kl';
else % noise only
Rnn{k} = lambda^2*Rnn{k}+(1-lambda^2).*Y_kl*Y_kl';
end
% Computing the MWF filter using a generalised eigenvalue
% decomposition of the correlation matrices.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Section of code to complete ~ 10 lines %
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
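% The MWF below is factored into a spatial filter and a single-channel
% Wiener post-filter: W_spectral = Qh(1,1)*(Rnn^-1*h)/(h'*Rnn^-1*h) with
% h = Q(:,m) from the GEVD, and W_scalar = rho_ss/(rho_ss + 1/(h'*Rnn^-1*h)),
% where rho_ss is the speech power estimate sig_yy(1,1)-sig_nn(1,1) obtained
% from the transformed correlation matrices.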
[V,d] = eig(Ryy{k},Rnn{k},'vector');
[d,ind] = sort(d,'descend','ComparisonMethod','real');
V = V(:,ind);
Qh = pinv(V); Q = Qh';
sig_yy = diag(diag(V'*Ryy{k}*V)); % keep only the (near-)diagonal part
sig_nn = diag(diag(V'*Rnn{k}*V));
sig_ss=sig_yy(1,1)-sig_nn(1,1);
for m=1:num_mics
rho_ss=sig_ss;
h=Q(:,m);
W_scalar = rho_ss/(rho_ss+1/(h'*pinv(Rnn{k})*h));
W_spectral=Qh(1,1)*(1/(h'*pinv(Rnn{k})*h))*pinv(Rnn{k})*h;
W_update=W_scalar*W_spectral;
if(max(abs(W_update))<skip_threshold) % only accept bounded filter updates
W_mvdr_mwfL(:,k,m)=W_update;
end
% Filtering the noisy speech, the speech-only, and the noise-only.
S_mvdr_mwfL_stft(k,l,m) = W_mvdr_mwfL(:,k,m)'*Y_kl;
X_mvdr_mwfL_stft(k,l,m) = W_mvdr_mwfL(:,k,m)'*X_kl;
N_mvdr_mwfL_stft(k,l,m) = W_mvdr_mwfL(:,k,m)'*N_kl;
end % end mics
end % end freqs
end % end time frames
toc
%% Observe processed STFTs
figure;
subplot(4,1,1);
spectrogram_plot(y_STFT(:,:,1), clow, chigh, time, f);
title("Multi Channel: left ear: noisy spectrogram");
subplot(4,1,2);
spectrogram_plot(S_mvdr_mwfL_stft(:,:,1), clow, chigh, time, f);
title("Multi Channel: left ear: filtered spectrogram");
subplot(4,1,3);
spectrogram_plot(y_STFT(:,:,2), clow, chigh, time, f);
title("Multi Channel: right ear: noisy spectrogram");
subplot(4,1,4);
spectrogram_plot(S_mvdr_mwfL_stft(:,:,2), clow, chigh, time, f);
title("Multi Channel: right ear: filtered spectrogram");
%% Apply the synthesis stage of the WOLA framework to obtain the time domain equivalents:
s_mwfL = WOLA_synthesis(S_mvdr_mwfL_stft,window,nfft,noverlap); % To complete (time-domain version of S_mvdr_mwfL_stft)
x_mwfL = WOLA_synthesis(X_mvdr_mwfL_stft,window,nfft,noverlap); % To complete (time-domain version of X_mvdr_mwfL_stft)
n_mwfL = WOLA_synthesis(N_mvdr_mwfL_stft,window,nfft,noverlap); % To complete (time-domain version of N_mvdr_mwfL_stft)
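% WOLA_synthesis.m is provided separately with the lab; for reference, a
% minimal sketch of the assumed interface (conjugate-symmetric extension of
% the positive-frequency bins, then windowed overlap-add) could look like:
%
% function x = WOLA_synthesis(X_STFT,window,nfft,noverlap)
%     hop = nfft/noverlap;
%     [N_freqs,N_frames,M] = size(X_STFT);
%     x = zeros((N_frames-1)*hop+nfft,M);
%     for m = 1:M                                    % loop over channels
%         for l = 1:N_frames                         % loop over time frames
%             spec = [X_STFT(:,l,m); conj(X_STFT(end-1:-1:2,l,m))];
%             frame = real(ifft(spec,nfft)).*window; % synthesis window
%             idx = (l-1)*hop+(1:nfft);
%             x(idx,m) = x(idx,m)+frame;             % overlap-add
%         end
%     end
% end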
%% EVALUATION
SNR_in  = snr(speech(:,1),noise(:,1));  % input SNR: clean speech vs. noise at the ref mic
SNR_out = snr(x_mwfL(:,1),n_mwfL(:,1)); % output SNR: filtered speech vs. filtered noise
fprintf("Multi Channel MWF: input SNR is %.2f dB and output SNR is %.2f dB.\n",SNR_in,SNR_out);
% note: try changing the noise source / mic placement and see how the result changes
figure;
plot(y_TD(:,1),'DisplayName','Original noisy signal'); hold on;
plot(s_mwfL(:,1),'DisplayName','Filtered noisy signal');
plot(x_mwfL(:,1),'DisplayName','Filtered clean signal');
plot(speech(:,1),'DisplayName','Original clean signal');
legend('Location','southwest');
%% the following code is stored in spectrogram_plot.m
% function spectrogram_plot(input, clow, chigh, time, f)
% imagesc(time,f/1000,mag2db(abs(input)), [clow, chigh]);
% colorbar; axis xy; set(gcf,'color','w');
% set(gca,'Fontsize',14);
% xlabel('Time (s)');
% ylabel('Frequency (kHz)');
% end
%% LISTEN
% fprintf("Original binaural clean signal, ");soundsc(speech,fs_RIR);pause;
% fprintf("filtered binaural clean signal, ");soundsc(x_mwfL,fs_RIR);pause;
%% debug result
% figure;plot(x_mwfL(:,1)-x_mwfL(:,2));hold on;plot(speech(:,1)-speech(:,2)); % binaural debug