-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathpreprocessing.py
81 lines (72 loc) · 2.99 KB
/
preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
from fastai.vision import *
import librosa, librosa.display
# Based on: https://www.kaggle.com/daisukelab/cnn-2d-basic-solution-powered-by-fast-ai
# Shared audio / mel-spectrogram settings used by the helpers in this file.
# Base values first; the remaining entries are derived from them.
conf = dict(sr=44100, duration=2, fmin=20, n_mels=128)
# Hop length scales linearly with the clip duration.
conf.update(hop_length=347 * conf['duration'])
# Upper mel frequency is half the sampling rate; FFT size is 20x the mel-band count.
conf.update(fmax=conf['sr'] // 2, n_fft=conf['n_mels'] * 20)
# Number of samples in one fixed-length clip.
conf.update(samples=conf['sr'] * conf['duration'])
def read_audio(conf, pathname, trim_long_data):
    """Load an audio file and fit it to the fixed length ``conf['samples']``.

    The file is loaded with ``librosa.load`` at ``conf['sr']`` and, when it is
    non-empty, passed through ``librosa.effects.trim`` with its default
    threshold.

    Args:
        conf: Settings mapping; reads the ``'sr'`` and ``'samples'`` keys.
        pathname: Path of the audio file, handed to ``librosa.load``.
        trim_long_data: When true, a signal longer than ``conf['samples']``
            is cut down to its first ``conf['samples']`` samples; when false
            it is returned at its full (trimmed) length.

    Returns:
        The signal array. Signals no longer than ``conf['samples']`` are
        padded with ``np.pad(..., 'constant')`` on both sides up to exactly
        ``conf['samples']``.
    """
    signal, _ = librosa.load(pathname, sr=conf['sr'])

    # Workaround kept from the original: trimming a zero-length signal errors.
    if len(signal) > 0:
        signal, _ = librosa.effects.trim(signal)

    target_len = conf['samples']
    shortfall = target_len - len(signal)

    if shortfall < 0:
        # Longer than the target: optionally keep only the leading part.
        return signal[:target_len] if trim_long_data else signal

    # Short (or exactly right): pad both ends; when the shortfall is odd the
    # extra sample goes on the trailing side.
    lead = shortfall // 2
    return np.pad(signal, (lead, shortfall - lead), 'constant')
def audio_to_melspectrogram(conf, audio):
    """Convert an audio signal to a dB-scaled mel spectrogram.

    Args:
        conf: Settings mapping. Every entry except ``'duration'`` and
            ``'samples'`` is forwarded as a keyword argument to
            ``librosa.feature.melspectrogram``.
        audio: The audio signal, passed to librosa as ``y``.

    Returns:
        The result of ``librosa.power_to_db`` applied to the mel spectrogram,
        cast to ``np.float32``.
    """
    # 'duration' and 'samples' describe the clip length; they are not
    # melspectrogram() parameters, so leave them out of the forwarded kwargs.
    mel_kwargs = {
        key: value
        for key, value in conf.items()
        if key not in ('duration', 'samples')
    }
    # Pass the signal by keyword: librosa >= 0.10 makes `y` keyword-only, and
    # older releases accept the keyword form as well.
    spectrogram = librosa.feature.melspectrogram(y=audio, **mel_kwargs)
    spectrogram = librosa.power_to_db(spectrogram)
    return spectrogram.astype(np.float32)
def show_melspectrogram(conf, mels, title='Log-frequency power spectrogram'):
    """Plot a mel spectrogram with a dB colorbar and show the figure.

    Args:
        conf: Settings mapping; reads ``'sr'``, ``'hop_length'``, ``'fmin'``
            and ``'fmax'`` to scale the plot axes.
        mels: Spectrogram data handed to ``librosa.display.specshow``.
        title: Title placed on the plot.
    """
    axis_opts = dict(x_axis='time', y_axis='mel')
    # Axis scaling must use the same settings the spectrogram was built with.
    scale_opts = {key: conf[key] for key in ('sr', 'hop_length', 'fmin', 'fmax')}
    librosa.display.specshow(mels, **axis_opts, **scale_opts)
    plt.colorbar(format='%+2.0f dB')
    plt.title(title)
    plt.show()
def read_as_melspectrogram(conf, pathname, trim_long_data, debug_display=False):
    """Read an audio file and return its mel spectrogram.

    Args:
        conf: Settings mapping passed through to ``read_audio`` and
            ``audio_to_melspectrogram``.
        pathname: Path of the audio file to read.
        trim_long_data: Forwarded to ``read_audio``.
        debug_display: When true, also show an IPython audio player for the
            loaded signal and plot the spectrogram.

    Returns:
        The mel spectrogram produced by ``audio_to_melspectrogram``.
    """
    x = read_audio(conf, pathname, trim_long_data)
    mels = audio_to_melspectrogram(conf, x)
    if debug_display:
        # Imported here because this file has no explicit IPython import, so
        # the debug path would otherwise depend on the star import binding it.
        import IPython.display
        IPython.display.display(IPython.display.Audio(x, rate=conf['sr']))
        show_melspectrogram(conf, mels)
    return mels
def mono_to_color(X, mean=None, std=None, norm_max=None, norm_min=None, eps=1e-6):
    """Standardize an array and rescale it to ``uint8`` values in [0, 255].

    Args:
        X: Input array (e.g. a mel spectrogram).
        mean: Value subtracted from ``X``; defaults to ``X.mean()``.
        std: Value ``X`` is divided by (plus ``eps``); defaults to ``X.std()``.
        norm_max: Upper clipping bound applied after standardization;
            defaults to the maximum of the standardized data.
        norm_min: Lower clipping bound applied after standardization;
            defaults to the minimum of the standardized data.
        eps: Added to ``std`` to avoid division by zero, and used as the
            threshold below which the data is treated as constant.

    Returns:
        A ``uint8`` array with the same shape as ``X``. If the standardized
        data spans no more than ``eps``, the result is all zeros.
    """
    # Explicit None checks: a caller-supplied 0 (mean=0.0, norm_min=0, ...)
    # is a legitimate value and must not be silently replaced by the default,
    # which is what `value or default` did.
    if mean is None:
        mean = X.mean()
    if std is None:
        std = X.std()

    # Standardize
    Xstd = (X - mean) / (std + eps)
    _min, _max = Xstd.min(), Xstd.max()
    if norm_max is None:
        norm_max = _max
    if norm_min is None:
        norm_min = _min

    if (_max - _min) > eps:
        # Clip to the normalization bounds, then scale to [0, 255].
        V = Xstd
        V[V < norm_min] = norm_min
        V[V > norm_max] = norm_max
        V = 255 * (V - norm_min) / (norm_max - norm_min)
        V = V.astype(np.uint8)
    else:
        # Constant input: nothing to scale, so return zeros.
        V = np.zeros_like(Xstd, dtype=np.uint8)
    return V
def convert_wav_to_image(df, fold, source):
    """Convert every audio file listed in ``df`` to a uint8 spectrogram image.

    Args:
        df: Table iterated with ``iterrows()``; each row's ``fname``
            attribute names an audio file.
        fold: Unused here; kept so existing callers keep working.
        source: Base path joined with each file name via the ``/`` operator.

    Returns:
        A list with one ``mono_to_color`` result per row, in row order.
    """
    rows = progress_bar(df.iterrows(), total=len(df))
    return [
        mono_to_color(
            read_as_melspectrogram(conf, source/str(record.fname), trim_long_data=False)
        )
        for _, record in rows
    ]
def convert_wav_to_png(row):
    """Convert one audio file to a spectrogram image and save it as a PNG.

    Args:
        row: Indexable pair whose second element holds the file name at
            position 0 (presumably an ``(index, row)`` item from
            ``DataFrame.iterrows()`` -- confirm against the caller).

    NOTE(review): relies on ``path_source`` and ``path_save``, which are not
    defined in this file and must exist in the module namespace before this
    function is called.
    """
    wav_name = row[1][0]
    mel = read_as_melspectrogram(conf, path_source/str(wav_name), trim_long_data=False)
    image = mono_to_color(mel)
    # Drop the last four characters of the name (e.g. '.wav') before adding '.png'.
    stem = wav_name[:-4]
    target = path_save/f'{stem}.png'
    PIL.Image.fromarray(image).save(target)