diff --git a/misc/scripts/alignment/state_align/forced_alignment.py b/misc/scripts/alignment/state_align/forced_alignment.py index 55da14a2..3ef84522 100644 --- a/misc/scripts/alignment/state_align/forced_alignment.py +++ b/misc/scripts/alignment/state_align/forced_alignment.py @@ -1,12 +1,14 @@ import os, sys import time +import random +import glob from sys import argv, stderr from subprocess import check_call, Popen, CalledProcessError, PIPE from mean_variance_norm import MeanVarianceNorm # string constants for various shell calls -STATE_NUM=5 +STATE_NUM = 5 F = str(0.01) SFAC = str(5.0) PRUNING = [str(i) for i in (250., 150., 2000.)] @@ -15,31 +17,31 @@ HMMDEFS = 'hmmdefs' VFLOORS = 'vFloors' -## -HTKDIR = path/to/tools/htk -HCompV = os.path.join(HTKDIR, 'HCompV') -HCopy = os.path.join(HTKDIR, 'HCopy' ) -HERest = os.path.join(HTKDIR, 'HERest') -HHEd = os.path.join(HTKDIR, 'HHEd' ) -HVite = os.path.join(HTKDIR, 'HVite' ) class ForcedAlignment(object): - def __init__(self): + def __init__(self, htk_dir): self.proto = None self.phoneme_mlf = None + self.set_htk_dir(htk_dir) + + def set_htk_dir(self, htk_dir): + self.HCompV = os.path.join(htk_dir, 'HCompV') + self.HCopy = os.path.join(htk_dir, 'HCopy') + self.HERest = os.path.join(htk_dir, 'HERest') + self.HHEd = os.path.join(htk_dir, 'HHEd') + self.HVite = os.path.join(htk_dir, 'HVite') def _make_proto(self): ## make proto fid = open(self.proto, 'w') means = ' '.join(['0.0' for _ in range(39)]) - varg = ' '.join(['1.0' for _ in range(39)]) + varg = ' '.join(['1.0' for _ in range(39)]) fid.write("""~o 39 ~h "proto" - 7 -""") - for i in range(2, STATE_NUM+2): + 7""") + for i in range(2, STATE_NUM + 2): fid.write(' {0}\n 39\n{1}\n'.format(i, means)) fid.write(' 39\n{0}\n'.format(varg)) fid.write(""" 7 @@ -55,20 +57,20 @@ def _make_proto(self): fid.close() ## make vFloors - check_call([HCompV, '-f', F, '-C', self.cfg, - '-S', self.train_scp, - '-M', self.cur_dir, self.proto]) + check_call([self.HCompV, '-f', F, '-C', 
self.cfg, + '-S', self.train_scp, + '-M', self.cur_dir, self.proto]) ## make local macro # get first three lines from local proto fid = open(os.path.join(self.cur_dir, MACROS), 'w') source = open(os.path.join(self.cur_dir, - os.path.split(self.proto)[1]), 'r') + os.path.split(self.proto)[1]), 'r') for _ in range(3): fid.write(source.readline()) source.close() # get remaining lines from vFloors fid.writelines(open(os.path.join(self.cur_dir, - VFLOORS), 'r').readlines()) + VFLOORS), 'r').readlines()) fid.close() ## make hmmdefs fid = open(os.path.join(self.cur_dir, HMMDEFS), 'w') @@ -95,7 +97,7 @@ def _read_file_list(self, file_name): file_lists.append(line) fid.close() - return file_lists + return file_lists def _full_to_mono(self, full_file_name, mono_file_name, phoneme_dict): fre = open(full_file_name, 'r') @@ -137,7 +139,6 @@ def _check_data(self, file_id_list, multiple_speaker): copy_scp.write('{0} {1}\n'.format(wav_file, mfc_file)) check_scp.write('{0}\n'.format(mfc_file)) - if multiple_speaker: tmp_list = file_id.split('/') speaker_name = tmp_list[0] @@ -149,7 +150,6 @@ def _check_data(self, file_id_list, multiple_speaker): speaker_utt_dict['only_one'] = [] speaker_utt_dict['only_one'].append(mfc_file) - self._full_to_mono(lab_file, mono_lab_file, phoneme_dict) copy_scp.close() check_scp.close() @@ -168,7 +168,7 @@ def _check_data(self, file_id_list, multiple_speaker): fid.write('"*/*.lab" -> "' + self.mono_lab_dir + '"\n') fid.close() - return speaker_utt_dict + return speaker_utt_dict def _HCopy(self): """ @@ -187,7 +187,7 @@ def _HCopy(self): NUMCHANS = 20 NUMCEPS = 12 """) - check_call([HCopy, '-C', self.cfg, '-S', self.copy_scp]) + check_call([self.HCopy, '-C', self.cfg, '-S', self.copy_scp]) # write a CFG for what we just built open(self.cfg, 'w').write("""TARGETRATE = 50000.0 TARGETKIND = USER @@ -215,7 +215,7 @@ def _nxt_dir(self): def prepare_training(self, file_id_list_name, wav_dir, lab_dir, work_dir, multiple_speaker): - print('---preparing 
enverionment') + print '---preparing enverionment' self.cfg_dir = os.path.join(work_dir, 'config') self.model_dir = os.path.join(work_dir, 'model') self.cur_dir = os.path.join(self.model_dir, 'hmm0') @@ -235,7 +235,7 @@ def prepare_training(self, file_id_list_name, wav_dir, lab_dir, work_dir, multip # CFG self.cfg = os.path.join(self.cfg_dir, 'cfg') - self.wav_dir=wav_dir + self.wav_dir = wav_dir self.lab_dir = lab_dir self.mfc_dir = os.path.join(work_dir, 'mfc') if not os.path.exists(self.mfc_dir): @@ -246,20 +246,20 @@ def prepare_training(self, file_id_list_name, wav_dir, lab_dir, work_dir, multip os.makedirs(self.mono_lab_dir) file_id_list = self._read_file_list(file_id_list_name) - print('---checking data') + print ('---checking data') speaker_utt_dict = self._check_data(file_id_list, multiple_speaker) - print('---extracting features') + print ('---extracting features') self._HCopy() - print(time.strftime("%c")) + print (time.strftime("%c")) - print('---feature_normalisation') - normaliser = MeanVarianceNorm(39) - for key_name in list(speaker_utt_dict.keys()): + print ('---feature_normalisation') + for key_name in speaker_utt_dict.keys(): + normaliser = MeanVarianceNorm(39) normaliser.feature_normalisation(speaker_utt_dict[key_name], speaker_utt_dict[key_name]) ## save to itself - print(time.strftime("%c")) + print (time.strftime("%c")) - print('---making proto') + print ('---making proto') self._make_proto() def train_hmm(self, niter, num_mix): @@ -267,22 +267,56 @@ def train_hmm(self, niter, num_mix): Perform one or more rounds of estimation """ - print(time.strftime("%c")) - print('---training HMM models') + print (time.strftime("%c")) + print ('---training HMM models') + + # call HErest in multiple chunks + # split scp in num_splits chunks and save them + num_splits = int(os.getenv('DNN_NUM_PARALLEL', 8)) + print ("----num_splits set to %s" % num_splits) + train_scp_chunks = [] + with open(self.train_scp, "rt") as fp: + mfc_files = fp.readlines() + 
random.shuffle(mfc_files) + n = (len(mfc_files) + 1) / num_splits + mfc_chunks = [mfc_files[j:j + n] for j in xrange(0, len(mfc_files), n)] + for i in range(len(mfc_chunks)): + train_scp_chunks.append(os.path.join(self.cfg_dir, "train_%d.scp" % i)) + with open(train_scp_chunks[i], "wt") as fp: + fp.writelines(mfc_chunks[i]) + done = 0 mix = 1 while mix <= num_mix and done == 0: for i in range(niter): - next_dir = os.path.join(self.model_dir, 'hmm_mix_' + str(mix) + '_iter_' + str(i+1)) + next_dir = os.path.join(self.model_dir, 'hmm_mix_' + str(mix) + '_iter_' + str(i + 1)) if not os.path.exists(next_dir): os.makedirs(next_dir) - check_call([HERest, '-C', self.cfg, '-S', self.train_scp, - '-I', self.phoneme_mlf, + + procs = [] + # estimate per chunk + for chunk_num in range(len(train_scp_chunks)): + procs.append(Popen([self.HERest, '-C', self.cfg, + '-S', train_scp_chunks[chunk_num], + '-I', self.phoneme_mlf, + '-M', next_dir, + '-H', os.path.join(self.cur_dir, MACROS), + '-H', os.path.join(self.cur_dir, HMMDEFS), + '-t'] + PRUNING + ['-p', str(chunk_num + 1), self.phonemes], + stdout=PIPE)) + + # wait until all HERest calls are finished + for p in procs: + p.wait() + + # now accumulate + check_call([self.HERest, '-C', self.cfg, '-M', next_dir, '-H', os.path.join(self.cur_dir, MACROS), '-H', os.path.join(self.cur_dir, HMMDEFS), - '-t'] + PRUNING + [self.phonemes], + '-t'] + PRUNING + ['-p', '0', self.phonemes] + glob.glob(next_dir + os.sep + "*.acc"), stdout=PIPE) + self.cur_dir = next_dir if mix * 2 <= num_mix: @@ -296,10 +330,10 @@ def train_hmm(self, niter, num_mix): if not os.path.exists(next_dir): os.makedirs(next_dir) - check_call( [HHEd, '-A', - '-H', os.path.join(self.cur_dir, MACROS), - '-H', os.path.join(self.cur_dir, HMMDEFS), - '-M', next_dir] + [hed_file] + [self.phonemes]) + check_call([self.HHEd, '-A', + '-H', os.path.join(self.cur_dir, MACROS), + '-H', os.path.join(self.cur_dir, HMMDEFS), + '-M', next_dir] + [hed_file] + [self.phonemes]) self.cur_dir 
= next_dir mix = mix * 2 @@ -310,11 +344,11 @@ def align(self, work_dir, lab_align_dir): """ Align using the models in self.cur_dir and MLF to path """ - print('---aligning data') - print(time.strftime("%c")) + print ('---aligning data') + print (time.strftime("%c")) self.align_mlf = os.path.join(work_dir, 'mono_align.mlf') - check_call([HVite, '-a', '-f', '-m', '-y', 'lab', '-o', 'SM', + check_call([self.HVite, '-a', '-f', '-m', '-y', 'lab', '-o', 'NM', '-i', self.align_mlf, '-L', self.mono_lab_dir, '-C', self.cfg, '-S', self.train_scp, '-H', os.path.join(self.cur_dir, MACROS), @@ -328,6 +362,7 @@ def _postprocess(self, mlf, lab_align_dir): if not os.path.exists(lab_align_dir): os.makedirs(lab_align_dir) + fstats = open("logprob.txt", "wt") state_num = STATE_NUM fid = open(mlf, 'r') line = fid.readline() @@ -339,45 +374,61 @@ def _postprocess(self, mlf, lab_align_dir): line = line.replace('"', '') file_base = os.path.basename(line) flab = open(os.path.join(self.lab_dir, file_base), 'r') - fw = open(os.path.join(lab_align_dir, file_base), 'w') + fw = open(os.path.join(lab_align_dir, file_base), 'w') + lab_logprob = 0.0 + lab_entries = 0 for full_lab in flab.readlines(): full_lab = full_lab.strip() for i in range(state_num): line = fid.readline() line = line.strip() tmp_list = line.split() - fw.write('{0} {1} {2}[{3}]\n'.format(tmp_list[0], tmp_list[1], full_lab, i+2)) - + fw.write('{0} {1} {2}[{3}]\n'.format(tmp_list[0], tmp_list[1], full_lab, i + 2)) + lab_logprob += float(tmp_list[3]) + lab_entries += 1 fw.close() flab.close() + fstats.write(file_base + " " + str(lab_entries) + " " + str(lab_logprob / lab_entries)) line = fid.readline() line = line.strip() if line != '.': - print('The two files are not matched!\n') + print ('The two files are not matched!\n') sys.exit(1) fid.close() + fstats.close() if __name__ == '__main__': - - work_dir = os.getcwd() - - wav_dir = os.path.join(work_dir, 'slt_wav') - lab_dir = os.path.join(work_dir, 'label_no_align') - 
lab_align_dir = os.path.join(work_dir, 'label_state_align') - - file_id_list_name = os.path.join(work_dir, 'file_id_list.scp') - ## if multiple_speaker is tuned on. the file_id_list.scp has to reflact this ## for example ## speaker_1/0001 ## speaker_2/0001 ## This is to do speaker-dependent normalisation multiple_speaker = False - - aligner = ForcedAlignment() - aligner.prepare_training(file_id_list_name, wav_dir, lab_dir, work_dir, multiple_speaker) - + if '-a' in sys.argv: + sys.argv.remove('-a') + multiple_speaker = True + + # hack to keep old run_aligner scripts (with sed replace) compatible + # they replace "HTKDIR =" and "work_dir =" + if (len(sys.argv)) > 1: + work_real_dir = sys.argv[1] + else: + work_dir = os.getcwd() + work_real_dir = work_dir + if len(sys.argv) > 2: + htk_dir = sys.argv[2] + else: + HTKDIR = None + htk_dir = HTKDIR + + wav_dir = os.path.join(work_real_dir, 'slt_wav') + lab_dir = os.path.join(work_real_dir, 'label_no_align') + lab_align_dir = os.path.join(work_real_dir, 'label_state_align') + file_id_list_name = os.path.join(work_real_dir, 'file_id_list.scp') + + aligner = ForcedAlignment(htk_dir) + aligner.prepare_training(file_id_list_name, wav_dir, lab_dir, work_real_dir, multiple_speaker) aligner.train_hmm(7, 32) - aligner.align(work_dir, lab_align_dir) - print('---done!') + aligner.align(work_real_dir, lab_align_dir) + print ('---done!')