-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathwhisperx_align.py
80 lines (44 loc) · 1.99 KB
/
whisperx_align.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import whisperx
import pandas as pd
import gc
import os
from scipy.io.wavfile import write
from tqdm import tqdm
# Runtime configuration consumed by align_whisperx below.
# Device string handed to whisperx.load_model, whisperx.load_align_model
# and whisperx.align.
DEVICE="cuda"
# Batch size passed to model.transcribe.
BATCH_SIZE=16
# Value forwarded as compute_type to whisperx.load_model.
COMPUTE_TYPE="float16"
def align_whisperx(dataset_path, output_path, language_code='hi', max_rows=10000):
    """Cut every utterance of a dataset into per-word WAV clips with WhisperX.

    The CSV at ``dataset_path`` must contain an ``audio_name`` column whose
    entries are audio paths relative to the directory holding the CSV. Each
    file is transcribed, force-aligned, and every word that received both a
    start and an end timestamp is written to ``output_path`` as
    ``<audio_name>_seg<i>.wav``. A ``metadata.csv`` listing each clip and its
    word label is written to ``output_path`` once all files are processed.

    Args:
        dataset_path: Path to the input metadata CSV.
        output_path: Directory receiving the word clips and ``metadata.csv``;
            created if missing.
        language_code: Language code passed to ``whisperx.load_align_model``.
        max_rows: Maximum number of CSV rows to process; ``None`` processes
            every row.

    Returns:
        None. Results are written to disk.
    """
    # Sample rate used both to turn timestamps into sample indices and to
    # write the clips. NOTE(review): assumes whisperx.load_audio returns
    # 16 kHz audio (its documented default) -- confirm if that ever changes.
    sample_rate = 16000

    model = whisperx.load_model("whisper-tiny-ct2", DEVICE, compute_type=COMPUTE_TYPE)
    # The alignment model does not depend on the audio file, so load it once
    # instead of once per utterance.
    model_a, metadata = whisperx.load_align_model(language_code=language_code, device=DEVICE)

    df = pd.read_csv(dataset_path)
    if max_rows is not None:
        df = df[:max_rows]

    os.makedirs(output_path, exist_ok=True)
    root_dir = os.path.dirname(dataset_path)
    output_metadata_path = os.path.join(output_path, "metadata.csv")

    aligned_list = []
    for audio_name in tqdm(df['audio_name'].tolist()):
        audio_path = os.path.join(root_dir, audio_name)
        audio = whisperx.load_audio(audio_path)
        result = model.transcribe(audio, batch_size=BATCH_SIZE)
        result = whisperx.align(result["segments"], model_a, metadata, audio, DEVICE, return_char_alignments=False)

        # Gather words from every aligned segment; an utterance may yield
        # zero segments (nothing recognised) or more than one.
        words = [
            word
            for segment in result['segments']
            for word in segment.get('words', [])
        ]

        for word_idx, word in enumerate(words):
            # Words the aligner could not place carry no timestamps; skip them.
            if 'start' not in word or 'end' not in word:
                continue
            start_frame = int(word['start'] * sample_rate)
            end_frame = int(word['end'] * sample_rate)
            if end_frame <= start_frame:
                # A zero-length clip carries no audio; do not emit it.
                continue

            clip_name = f'{audio_name}_seg{word_idx}.wav'
            save_path = os.path.join(output_path, clip_name)
            # audio_name may include a sub-directory; make sure it exists.
            os.makedirs(os.path.dirname(save_path), exist_ok=True)
            write(save_path, sample_rate, audio[start_frame:end_frame])
            aligned_list.append({'audio_path': clip_name, 'transcript': word['word']})

    # Fix the columns so the CSV keeps its header even when no word was aligned.
    meta_df = pd.DataFrame(aligned_list, columns=['audio_path', 'transcript'])
    meta_df.to_csv(output_metadata_path)
    print("Aligned sucessfully")
if __name__ == '__main__':
    # argparse is only needed when the file is executed as a script.
    import argparse

    parser = argparse.ArgumentParser(
        description='Split dataset utterances into per-word clips with WhisperX.'
    )
    # Defaults keep the original hard-coded locations, so running the script
    # without arguments behaves exactly as before.
    parser.add_argument(
        '--dataset-path',
        default='/root/suyash/acoustic_stuff/hindi-acoustic-word-embedding/train_dataset/metadata.csv',
        help='CSV file with an audio_name column.',
    )
    parser.add_argument(
        '--output-path',
        default='/root/suyash/acoustic_stuff/hindi-acoustic-word-embedding/dataset/train_aligned_whisper',
        help='Directory that receives the word clips and metadata.csv.',
    )
    args = parser.parse_args()
    align_whisperx(dataset_path=args.dataset_path, output_path=args.output_path)