-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
83 lines (60 loc) · 3.1 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
# Control Flow:
# Inputs: mp3 file descriptor, destination file descriptor
# Outputs: wav file at destination file descriptor
# PREPROCESSING:
# 0. Use librosa to resample mp3 file as a 44.1kHz wav file
# 1. (if stereo -> do everything twice, once for each channel)
# len > 1.5s |-> Split mp3 into 1 second chunks with 0.5 second overlap (if \in [1, 1.5]s, pad with zeros at beginning and end to 1.5 seconds)
# len < 1s |-> Pad with zeros at beginning and end to 1 second
# len = 1s |-> Do nothing extra
# len = 0s |-> Error
# 2. For each chunk, compute the STFT with hop len 512 and nfft 2048
# 3. Pad magnitude and phase to be of shape 1025, 87
# MODEL:
# 4. Use polyfit method to extend the transients
# 5. Plug in to model and get output
# POSTPROCESSING:
# 6. Stitch the spectrograms together along the time axis
# 7. Run ISTFT with modified griffin lim algorithm on the stitched spectrogram to get the output audio
# |-> Modify griffin lim algorithm to use/start with the phase from the output of the model (should try with and without)
# 8. Write output audio to file
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import soundfile as sf
import preprocessing
import predict
import postprocessing
import sys
# TODO - noise reduction?
# Identify artifats (perhaps with std dev for magnitude coefficients, and ignore normalization for those)
# Item - by item normalization (deal with batchnorm) or use moving average/minmax from original mp3
def separate_channels(pred_mag, pred_phase):
# Assuming even indices are left channel and odd indices are right channel
pred_mag_left = pred_mag[::2] # Slices out the left channel magnitudes
pred_phase_left = pred_phase[::2] # Slices out the left channel phases
pred_mag_right = pred_mag[1::2] # Slices out the right channel magnitudes
pred_phase_right = pred_phase[1::2] # Slices out the right channel phases
return pred_mag_left, pred_phase_left, pred_mag_right, pred_phase_right
if __name__ == "__main__":
MODEL_FILEPATH = 'ResUNet_LSTM_small3GB.keras'
# command line args 1 and 2 are the input and output filepaths
input_path = sys.argv[1]
output_path = sys.argv[2]
# preprocessing
print("Preprocessing")
combined_mp3 = preprocessing.preprocess(input_path)
# predict
print("Predicting")
mags, phases = predict.polyfit_and_predict(combined_mp3, MODEL_FILEPATH)
# Separate the predictions into left and right channels
pred_mag_left, pred_phase_left, pred_mag_right, pred_phase_right = separate_channels(mags, phases)
# postprocessing
print("Postprocessing")
output_audio_gl_left = postprocessing.postprocess(pred_mag_left, pred_phase_left, visualize=True)
output_audio_gl_right = postprocessing.postprocess(pred_mag_right, pred_phase_right, visualize=True)
# Combine the left and right channels into stereo
output_audio_gl_stereo = np.vstack((output_audio_gl_left, output_audio_gl_right)).T
# Save both as wav files to output_path
print("Saving")
sf.write(output_path + 'upscaled_mp3.wav', output_audio_gl_stereo, 44100, "PCM_24")