#!/usr/bin/env python3
# Copyright (c) 2020 Xiaomi Corporation (authors: Junbo Zhang, Haowen Qiu)
# Apache 2.0
import multiprocessing
import os
from concurrent.futures import ProcessPoolExecutor

import torch
from lhotse import CutSet, Fbank, LilcomFilesWriter, WavAugmenter
from lhotse.recipes.librispeech import prepare_librispeech, dataset_parts_full

print("All dataset parts: ", dataset_parts_full)

# Prepare only the subsets needed for a small train/dev/test setup.
dataset_parts = ('dev-clean', 'test-clean', 'train-clean-100')
print("Parts we will prepare: ", dataset_parts)

# Adjust these paths to your environment.
corpus_dir = '/home/storage04/zhuangweiji/data/open-source-data/librispeech/LibriSpeech'
output_dir = 'exp/data'
librispeech_manifests = prepare_librispeech(corpus_dir, dataset_parts, output_dir)
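
# A sketch of the returned structure, as I read the lhotse API (an assumption,
# not part of the original script): a dict keyed by partition, each entry
# holding the audio and transcript manifests, e.g.
#
#   librispeech_manifests['dev-clean']['recordings']    # RecordingSet
#   librispeech_manifests['dev-clean']['supervisions']  # SupervisionSet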

use_data_augmentation = False
augmenter = (WavAugmenter.create_predefined('pitch_reverb_tdrop', sampling_rate=16000)
             if use_data_augmentation else None)
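# Judging by its name, the 'pitch_reverb_tdrop' preset chains pitch shift,
# reverberation, and time dropout; this is my reading of the WavAugment preset,
# not something stated in the original script.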

# Spawning multiple Python subprocesses that share the same sox handle seems to
# raise "std::runtime_error: Couldn't close file". The issue appears only in a
# Jupyter notebook on macOS, hence the workaround below.
if use_data_augmentation:
    num_jobs = 1
else:
    num_jobs = os.cpu_count()
    # Keep PyTorch single-threaded in each worker so the process pool does not
    # oversubscribe the CPU during feature extraction.
    torch.set_num_threads(1)
    torch.set_num_interop_threads(1)

for partition, manifests in librispeech_manifests.items():
    print(partition)
    with LilcomFilesWriter(f'{output_dir}/feats_{partition}') as storage, \
            ProcessPoolExecutor(num_jobs, mp_context=multiprocessing.get_context("spawn")) as ex:
        cut_set = CutSet.from_manifests(
            recordings=manifests['recordings'],
            supervisions=manifests['supervisions'],
        ).compute_and_store_features(
            extractor=Fbank(),
            storage=storage,
            # Augment only the training partition; dev/test stay clean.
            augment_fn=augmenter if 'train' in partition else None,
            executor=ex,
        )
    librispeech_manifests[partition]['cuts'] = cut_set
    cut_set.to_json(output_dir + f'/cuts_{partition}.json.gz')
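
# Usage sketch (my addition, assuming lhotse's CutSet.from_json API): the cut
# manifests written above can be loaded back later for training, e.g.
#
#   cuts_train = CutSet.from_json(f'{output_dir}/cuts_train-clean-100.json.gz')
#   first_cut = next(iter(cuts_train))  # one cut: features + supervision info
#   print(first_cut)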