Skip to content

Commit

Permalink
Update Frame-VAD doc and fix onnx export (#7076)
Browse files Browse the repository at this point in the history
* update fvad doc

Signed-off-by: stevehuang52 <heh@nvidia.com>

* fix typo

Signed-off-by: stevehuang52 <heh@nvidia.com>

* update fvad example

Signed-off-by: stevehuang52 <heh@nvidia.com>

* update

Signed-off-by: stevehuang52 <heh@nvidia.com>

* fix onnx export

Signed-off-by: stevehuang52 <heh@nvidia.com>

* update test

Signed-off-by: stevehuang52 <heh@nvidia.com>

* refactor

Signed-off-by: stevehuang52 <heh@nvidia.com>

* update doc

Signed-off-by: stevehuang52 <heh@nvidia.com>

* update

Signed-off-by: stevehuang52 <heh@nvidia.com>

---------

Signed-off-by: stevehuang52 <heh@nvidia.com>
Co-authored-by: fayejf <36722593+fayejf@users.noreply.github.com>
Signed-off-by: jubick1337 <mattyson.so@gmail.com>
  • Loading branch information
2 people authored and jubick1337 committed Aug 8, 2023
1 parent c3a55f9 commit ba16234
Show file tree
Hide file tree
Showing 5 changed files with 69 additions and 11 deletions.
3 changes: 2 additions & 1 deletion examples/asr/conf/vad/frame_vad_infer_postprocess.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
name: &name "vad_inference_postprocessing"

dataset: null # Path of json file of evaluation data. Audio files should have unique names
input_manifest: null # Path of json file of evaluation data. Audio files should have unique names
output_dir: null # Path to output directory where results will be stored
num_workers: 12
sample_rate: 16000
evaluate: False # whether to get AUROC and DERs; the manifest must contain ground truth if enabled
Expand Down
24 changes: 18 additions & 6 deletions examples/asr/speech_classification/frame_vad_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@
## Usage:
python frame_vad_infer.py \
--config-path="../conf/vad" --config-name="frame_vad_infer_postprocess" \
dataset=<Path of manifest file containing evaluation data. Audio files should have unique names>
input_manifest=<Path of manifest file containing evaluation data. Audio files should have unique names> \
output_dir=<Path of output directory>
The manifest json file should have the following format (each line is a Python dictionary):
{"audio_filepath": "/path/to/audio_file1", "offset": 0, "duration": 10000}
Expand Down Expand Up @@ -58,15 +59,25 @@

@hydra_runner(config_path="../conf/vad", config_name="frame_vad_infer_postprocess")
def main(cfg):
if not cfg.dataset:
if not cfg.input_manifest:
raise ValueError("You must input the path of json file of evaluation data")
output_dir = cfg.output_dir if cfg.output_dir else "frame_vad_outputs"
if os.path.exists(output_dir):
logging.warning(
f"Output directory {output_dir} already exists, use this only if you're tuning post-processing params."
)
Path(output_dir).mkdir(parents=True, exist_ok=True)

cfg.frame_out_dir = os.path.join(output_dir, "frame_preds")
cfg.smoothing_out_dir = os.path.join(output_dir, "smoothing_preds")
cfg.rttm_out_dir = os.path.join(output_dir, "rttm_preds")

# each line of dataset should be have different audio_filepath and unique name to simplify edge cases or conditions
logging.info(f"Loading manifest file {cfg.dataset}")
# each line of input_manifest should have a different audio_filepath and a unique name to simplify edge cases or conditions
logging.info(f"Loading manifest file {cfg.input_manifest}")
manifest_orig, key_labels_map, key_rttm_map = frame_vad_infer_load_manifest(cfg)

# Prepare manifest for streaming VAD
manifest_vad_input = cfg.dataset
manifest_vad_input = cfg.input_manifest
if cfg.prepare_manifest.auto_split:
logging.info("Split long audio file to avoid CUDA memory issue")
logging.debug("Try smaller split_duration if you still have CUDA memory issue")
Expand All @@ -76,6 +87,7 @@ def main(cfg):
'split_duration': cfg.prepare_manifest.split_duration,
'num_workers': cfg.num_workers,
'prepared_manifest_vad_input': cfg.prepared_manifest_vad_input,
'out_dir': output_dir,
}
manifest_vad_input = prepare_manifest(config)
else:
Expand Down Expand Up @@ -171,7 +183,7 @@ def main(cfg):
key_pred_rttm_map[key] = entry['rttm_filepath']

if not cfg.out_manifest_filepath:
out_manifest_filepath = "manifest_vad_output.json"
out_manifest_filepath = os.path.join(output_dir, "manifest_vad_output.json")
else:
out_manifest_filepath = cfg.out_manifest_filepath
write_manifest(out_manifest_filepath, manifest_new)
Expand Down
43 changes: 43 additions & 0 deletions nemo/collections/asr/models/classification_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
from nemo.core.classes.common import PretrainedModelInfo, typecheck
from nemo.core.neural_types import *
from nemo.utils import logging, model_utils
from nemo.utils.cast_utils import cast_all

__all__ = ['EncDecClassificationModel', 'EncDecRegressionModel']

Expand Down Expand Up @@ -851,6 +852,8 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None):
self.eval_loop_cnt = 0
self.ratio_threshold = cfg.get('ratio_threshold', 0.2)
super().__init__(cfg=cfg, trainer=trainer)
self.decoder.output_types = self.output_types
self.decoder.output_types_for_export = self.output_types

@classmethod
def list_available_models(cls) -> Optional[List[PretrainedModelInfo]]:
Expand Down Expand Up @@ -1148,3 +1151,43 @@ def get_metric_logits_labels(self, logits, labels, masks):
labels = labels.gather(dim=0, index=idx.view(-1))

return logits, labels

def forward_for_export(
    self, input, length=None, cache_last_channel=None, cache_last_time=None, cache_last_channel_len=None
):
    """
    Export-time forward pass, used when the model is exported to ONNX format.

    The cache arguments only have to be supplied when exporting a streaming model. When
    cache_last_channel is None, the input module is called with just the signal and its length.

    Args:
        input: Tensor that represents a batch of raw audio signals,
            of shape [B, T]. T here represents timesteps.
        length: Vector of length B, that contains the individual lengths of the audio sequences.
        cache_last_channel: Tensor of shape [N, B, T, H] which contains the cache for last channel layers
        cache_last_time: Tensor of shape [N, B, H, T] which contains the cache for last time layers
            N is the number of such layers which need caching, B is batch size, H is the hidden size
            of activations, and T is the length of the cache
        cache_last_channel_len: forwarded to the input module together with the two caches above
    Returns:
        Without caches: the output module's result (its first element when it returns a tuple).
        With caches: the tuple (result, length, cache_last_channel, cache_last_time,
        cache_last_channel_len) as returned by the input module.
        In both cases the value is passed through cast_all with from_dtype=torch.float16 and
        to_dtype=torch.float32 before being returned.
    """
    # Use the module's export-specific forward when it defines one, otherwise its regular forward.
    encode = getattr(self.input_module, 'forward_for_export', self.input_module.forward)
    if cache_last_channel is not None:
        (
            encoder_output,
            length,
            cache_last_channel,
            cache_last_time,
            cache_last_channel_len,
        ) = encode(
            audio_signal=input,
            length=length,
            cache_last_channel=cache_last_channel,
            cache_last_time=cache_last_time,
            cache_last_channel_len=cache_last_channel_len,
        )
    else:
        encoder_output = encode(audio_signal=input, length=length)
        if isinstance(encoder_output, tuple):
            encoder_output = encoder_output[0]

    decode = getattr(self.output_module, 'forward_for_export', self.output_module.forward)
    decoded = decode(hidden_states=encoder_output.transpose(1, 2))
    result = decoded[0] if isinstance(decoded, tuple) else decoded

    # Tested after the encoder call on purpose: the streaming branch rebinds cache_last_channel.
    if cache_last_channel is not None:
        result = (result, length, cache_last_channel, cache_last_time, cache_last_channel_len)
    return cast_all(result, from_dtype=torch.float16, to_dtype=torch.float32)
6 changes: 4 additions & 2 deletions nemo/collections/asr/parts/utils/vad_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -275,7 +275,9 @@ def generate_overlap_vad_seq(
if out_dir:
overlap_out_dir = out_dir
else:
overlap_out_dir = frame_pred_dir + "/overlap_smoothing_output" + "_" + smoothing_method + "_" + str(overlap)
overlap_out_dir = os.path.join(
frame_pred_dir, "/overlap_smoothing_output" + "_" + smoothing_method + "_" + str(overlap)
)

if not os.path.exists(overlap_out_dir):
os.mkdir(overlap_out_dir)
Expand Down Expand Up @@ -732,7 +734,7 @@ def generate_vad_segment_table(
if not out_dir:
out_dir_name = "seg_output_"
for key in postprocessing_params:
out_dir_name = out_dir_name + str(key) + str(postprocessing_params[key]) + "-"
out_dir_name = out_dir_name + "-" + str(key) + str(postprocessing_params[key])

out_dir = os.path.join(vad_pred_dir, out_dir_name)

Expand Down
4 changes: 2 additions & 2 deletions tests/collections/asr/test_asr_classification_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,8 +94,8 @@ def frame_classification_model():
}

decoder = {
'cls': 'nemo.collections.asr.modules.ConvASRDecoderClassification',
'params': {'feat_in': 32, 'num_classes': 5,},
'cls': 'nemo.collections.common.parts.MultiLayerPerceptron',
'params': {'hidden_size': 32, 'num_classes': 5,},
}

modelConfig = DictConfig(
Expand Down

0 comments on commit ba16234

Please sign in to comment.