Merge pull request #1919 from mozilla/tfdatatest

Implement input pipeline with tf.data API
mozilla · Apr 5, 2019 · 5745089 · 5745089
2 parents f458337 + 6154150
commit 5745089
Show file tree

Hide file tree

Showing 68 changed files with 814 additions and 6,835 deletions.
diff --git a/DeepSpeech.py b/DeepSpeech.py
diff --git a/GRAPH_VERSION b/GRAPH_VERSION
@@ -0,0 +1 @@
+1
diff --git a/README.md b/README.md
@@ -33,10 +33,11 @@ See the output of `deepspeech -h` for more information on the use of `deepspeech
 - [Prerequisites](#prerequisites)
 - [Getting the code](#getting-the-code)
 - [Getting the pre-trained model](#getting-the-pre-trained-model)
-- [CUDA dependency](#cuda-dependency)
 - [Using the model](#using-the-model)
+  - [CUDA dependency](#cuda-dependency)
+  - [Model compatibility](#model-compatibility)
   - [Using the Python package](#using-the-python-package)
-  - [Using the command line client](#using-the-command-line-client)
+  - [Using the command-line client](#using-the-command-line-client)
   - [Using the Node.JS package](#using-the-nodejs-package)
   - [Installing bindings from source](#installing-bindings-from-source)
   - [Third party bindings](#third-party-bindings)
@@ -48,6 +49,7 @@ See the output of `deepspeech -h` for more information on the use of `deepspeech
   - [Checkpointing](#checkpointing)
   - [Exporting a model for inference](#exporting-a-model-for-inference)
   - [Exporting a model for TFLite](#exporting-a-model-for-tflite)
+  - [Making a mmap-able model for inference](#making-a-mmap-able-model-for-inference)
   - [Continuing training from a release model](#continuing-training-from-a-release-model)
 - [Contact/Getting Help](#contactgetting-help)
 
@@ -88,6 +90,10 @@ There are three ways to use DeepSpeech inference:
 
 The GPU-capable builds (Python, NodeJS, C++, etc.) depend on the same CUDA runtime as upstream TensorFlow. Currently, with TensorFlow r1.12, they depend on CUDA 9.0 and cuDNN v7.2.
 
+### Model compatibility
+
+DeepSpeech models are versioned to keep you from trying to use an incompatible graph with a newer client after a breaking change has been made to the code. If you get an error saying your model file version is too old for the client, you should upgrade to a newer model release, re-export your model from the checkpoint using a newer version of the code, or downgrade your client if you need to use the old model and can't re-export it.
+
 ### Using the Python package
 
 Pre-built binaries which can be used for performing inference with a trained model can be installed with `pip3`. You can then use the `deepspeech` binary to do speech-to-text on an audio file:
@@ -323,7 +329,7 @@ Refer to the corresponding [README.md](native_client/README.md) for information
 
 ### Exporting a model for TFLite
 
-If you want to experiment with the TF Lite engine, you need to export a model that is compatible with it, then use the `--export_tflite` flag. If you already have a trained model, you can re-export it for TFLite by running `DeepSpeech.py` again and specifying the same `checkpoint_dir` that you used for training, as well as passing `--notrain --notest --export_tflite --export_dir /model/export/destination`.
+If you want to experiment with the TF Lite engine, you need to export a model that is compatible with it. To do so, use the `--nouse_seq_length --export_tflite` flags. If you already have a trained model, you can re-export it for TFLite by running `DeepSpeech.py` again and specifying the same `checkpoint_dir` that you used for training, as well as passing `--nouse_seq_length --export_tflite --export_dir /model/export/destination`.
 
 ### Making a mmap-able model for inference
 

diff --git a/bin/run-ldc93s1.sh b/bin/run-ldc93s1.sh
@@ -16,12 +16,10 @@ else
     checkpoint_dir=$(python -c 'from xdg import BaseDirectory as xdg; print(xdg.save_data_path("deepspeech/ldc93s1"))')
 fi
 
-python -u DeepSpeech.py --noshow_progressbar --noearly_stop \
+python -u DeepSpeech.py --noshow_progressbar \
   --train_files data/ldc93s1/ldc93s1.csv \
-  --dev_files data/ldc93s1/ldc93s1.csv \
   --test_files data/ldc93s1/ldc93s1.csv \
   --train_batch_size 1 \
-  --dev_batch_size 1 \
   --test_batch_size 1 \
   --n_hidden 100 \
   --epoch 200 \

diff --git a/bin/run-tc-ldc93s1_checkpoint.sh b/bin/run-tc-ldc93s1_checkpoint.sh
@@ -16,7 +16,7 @@ python -u DeepSpeech.py --noshow_progressbar --noearly_stop \
   --train_files ${ldc93s1_csv} --train_batch_size 1 \
   --dev_files ${ldc93s1_csv} --dev_batch_size 1 \
   --test_files ${ldc93s1_csv} --test_batch_size 1 \
-  --n_hidden 494 --epoch -1 --random_seed 4567 --default_stddev 0.046875 \
+  --n_hidden 100 --epoch -1 \
   --max_to_keep 1 --checkpoint_dir '/tmp/ckpt' \
   --learning_rate 0.001 --dropout_rate 0.05 \
   --lm_binary_path 'data/smoke_test/vocab.pruned.lm' \

diff --git a/bin/run-tc-ldc93s1_new.sh b/bin/run-tc-ldc93s1_new.sh
@@ -14,12 +14,11 @@ fi;
 
 python -u DeepSpeech.py --noshow_progressbar --noearly_stop \
   --train_files ${ldc93s1_csv} --train_batch_size 1 \
-  --train_cached_features_path "/tmp/ldc93s1.hdf5" \
+  --train_cached_features_path '/tmp/ldc93s1_cache' \
   --dev_files ${ldc93s1_csv} --dev_batch_size 1 \
   --test_files ${ldc93s1_csv} --test_batch_size 1 \
-  --n_hidden 494 --epoch $epoch_count --random_seed 4567 \
-  --default_stddev 0.046875 --max_to_keep 1 \
-  --checkpoint_dir '/tmp/ckpt' \
+  --n_hidden 100 --epoch $epoch_count \
+  --max_to_keep 1 --checkpoint_dir '/tmp/ckpt' \
   --learning_rate 0.001 --dropout_rate 0.05  --export_dir '/tmp/train' \
   --lm_binary_path 'data/smoke_test/vocab.pruned.lm' \
-  --lm_trie_path 'data/smoke_test/vocab.trie' \
+  --lm_trie_path 'data/smoke_test/vocab.trie'
diff --git a/bin/run-tc-ldc93s1_singleshotinference.sh b/bin/run-tc-ldc93s1_singleshotinference.sh
@@ -14,19 +14,15 @@ python -u DeepSpeech.py --noshow_progressbar --noearly_stop \
   --train_files ${ldc93s1_csv} --train_batch_size 1 \
   --dev_files ${ldc93s1_csv} --dev_batch_size 1 \
   --test_files ${ldc93s1_csv} --test_batch_size 1 \
-  --n_hidden 494 --epoch 1 --random_seed 4567 --default_stddev 0.046875 \
+  --n_hidden 100 --epoch 1 \
   --max_to_keep 1 --checkpoint_dir '/tmp/ckpt' --checkpoint_secs 0 \
   --learning_rate 0.001 --dropout_rate 0.05 \
   --lm_binary_path 'data/smoke_test/vocab.pruned.lm' \
   --lm_trie_path 'data/smoke_test/vocab.trie'
 
-python -u DeepSpeech.py --noshow_progressbar --noearly_stop \
-  --train_files ${ldc93s1_csv} --train_batch_size 1 \
-  --dev_files ${ldc93s1_csv} --dev_batch_size 1 \
-  --test_files ${ldc93s1_csv} --test_batch_size 1 \
-  --n_hidden 494 --epoch 1 --random_seed 4567 --default_stddev 0.046875 \
-  --max_to_keep 1 --checkpoint_dir '/tmp/ckpt' --checkpoint_secs 0 \
-  --learning_rate 0.001 --dropout_rate 0.05 \
+python -u DeepSpeech.py \
+  --n_hidden 100 \
+  --checkpoint_dir '/tmp/ckpt' \
   --lm_binary_path 'data/smoke_test/vocab.pruned.lm' \
   --lm_trie_path 'data/smoke_test/vocab.trie' \
   --one_shot_infer 'data/smoke_test/LDC93S1.wav'
diff --git a/bin/run-tc-ldc93s1_tflite.sh b/bin/run-tc-ldc93s1_tflite.sh
@@ -11,10 +11,9 @@ if [ ! -f "${ldc93s1_dir}/ldc93s1.csv" ]; then
 fi;
 
 python -u DeepSpeech.py --noshow_progressbar \
-  --n_hidden 494 \
+  --n_hidden 100 \
   --checkpoint_dir '/tmp/ckpt' \
-  --export_dir '/tmp/train' \
+  --export_dir '/tmp/train_tflite' \
   --lm_binary_path 'data/smoke_test/vocab.pruned.lm' \
   --lm_trie_path 'data/smoke_test/vocab.trie' \
-  --notrain --notest \
-  --export_tflite \
+  --export_tflite --nouse_seq_length
diff --git a/evaluate.py b/evaluate.py
@@ -5,92 +5,67 @@
 import itertools
 import json
 import numpy as np
-import os
-import pandas
 import progressbar
-import sys
-import tables
 import tensorflow as tf
 
-from collections import namedtuple
 from ds_ctcdecoder import ctc_beam_search_decoder_batch, Scorer
-from multiprocessing import Pool, cpu_count
+from multiprocessing import cpu_count
 from six.moves import zip, range
-from util.audio import audiofile_to_input_vector
 from util.config import Config, initialize_globals
+from util.evaluate_tools import calculate_report
+from util.feeding import create_dataset
 from util.flags import create_flags, FLAGS
 from util.logging import log_error
-from util.preprocess import preprocess
-from util.text import Alphabet, levenshtein
-from util.evaluate_tools import process_decode_result, calculate_report
+from util.text import levenshtein
 
-def split_data(dataset, batch_size):
-    remainder = len(dataset) % batch_size
-    if remainder != 0:
-        dataset = dataset[:-remainder]
 
-    for i in range(0, len(dataset), batch_size):
-        yield dataset[i:i + batch_size]
+def sparse_tensor_value_to_texts(value, alphabet):
+    r"""
+    Given a :class:`tf.SparseTensor` ``value``, return an array of Python strings
+    representing its values, converting tokens to strings using ``alphabet``.
+    """
+    return sparse_tuple_to_texts((value.indices, value.values, value.dense_shape), alphabet)
 
 
-def pad_to_dense(jagged):
-    maxlen = max(len(r) for r in jagged)
-    subshape = jagged[0].shape
+def sparse_tuple_to_texts(tuple, alphabet):
+    indices = tuple[0]
+    values = tuple[1]
+    results = [''] * tuple[2][0]
+    for i in range(len(indices)):
+        index = indices[i][0]
+        results[index] += alphabet.string_from_label(values[i])
+    # List of strings
+    return results
 
-    padded = np.zeros((len(jagged), maxlen) +
-                      subshape[1:], dtype=jagged[0].dtype)
-    for i, row in enumerate(jagged):
-        padded[i, :len(row)] = row
-    return padded
 
-
-def evaluate(test_data, inference_graph):
+def evaluate(test_csvs, create_model):
     scorer = Scorer(FLAGS.lm_alpha, FLAGS.lm_beta,
                     FLAGS.lm_binary_path, FLAGS.lm_trie_path,
                     Config.alphabet)
 
+    test_set, test_batches = create_dataset(test_csvs,
+                                            batch_size=FLAGS.test_batch_size,
+                                            cache_path=FLAGS.test_cached_features_path)
+    it = test_set.make_one_shot_iterator()
 
-    def create_windows(features):
-        num_strides = len(features) - (Config.n_context * 2)
+    (batch_x, batch_x_len), batch_y = it.get_next()
 
-        # Create a view into the array with overlapping strides of size
-        # numcontext (past) + 1 (present) + numcontext (future)
-        window_size = 2*Config.n_context+1
-        features = np.lib.stride_tricks.as_strided(
-            features,
-            (num_strides, window_size, Config.n_input),
-            (features.strides[0], features.strides[0], features.strides[1]),
-            writeable=False)
+    # One rate per layer
+    no_dropout = [None] * 6
+    logits, _ = create_model(batch_x=batch_x,
+                             seq_length=batch_x_len,
+                             dropout=no_dropout)
 
-        return features
+    # Transpose to batch major and apply softmax for decoder
+    transposed = tf.nn.softmax(tf.transpose(logits, [1, 0, 2]))
 
-    # Create overlapping windows over the features
-    test_data['features'] = test_data['features'].apply(create_windows)
+    loss = tf.nn.ctc_loss(labels=batch_y,
+                          inputs=logits,
+                          sequence_length=batch_x_len)
 
     with tf.Session(config=Config.session_config) as session:
-        inputs, outputs, layers = inference_graph
-
-        # Transpose to batch major for decoder
-        transposed = tf.transpose(outputs['outputs'], [1, 0, 2])
-
-        labels_ph = tf.placeholder(tf.int32, [FLAGS.test_batch_size, None], name="labels")
-        label_lengths_ph = tf.placeholder(tf.int32, [FLAGS.test_batch_size], name="label_lengths")
-
-        # We add 1 to all elements of the transcript to avoid any zero values
-        # since we use that as an end-of-sequence token for converting the batch
-        # into a SparseTensor. So here we convert the placeholder back into a
-        # SparseTensor and subtract ones to get the real labels.
-        sparse_labels = tf.contrib.layers.dense_to_sparse(labels_ph)
-        neg_ones = tf.SparseTensor(sparse_labels.indices, -1 * tf.ones_like(sparse_labels.values), sparse_labels.dense_shape)
-        sparse_labels = tf.sparse_add(sparse_labels, neg_ones)
-
-        loss = tf.nn.ctc_loss(labels=sparse_labels,
-                              inputs=layers['raw_logits'],
-                              sequence_length=inputs['input_lengths'])
-
         # Create a saver using variables from the above newly created graph
-        mapping = {v.op.name: v for v in tf.global_variables() if not v.op.name.startswith('previous_state_')}
-        saver = tf.train.Saver(mapping)
+        saver = tf.train.Saver()
 
         # Restore variables from training checkpoint
         checkpoint = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
@@ -103,51 +78,38 @@ def create_windows(features):
 
         logitses = []
         losses = []
+        seq_lengths = []
+        ground_truths = []
 
         print('Computing acoustic model predictions...')
-        batch_count = len(test_data) // FLAGS.test_batch_size
-        bar = progressbar.ProgressBar(max_value=batch_count,
+        bar = progressbar.ProgressBar(max_value=test_batches,
                                       widget=progressbar.AdaptiveETA)
 
         # First pass, compute losses and transposed logits for decoding
-        for batch in bar(split_data(test_data, FLAGS.test_batch_size)):
-            session.run(outputs['initialize_state'])
-
-            features = pad_to_dense(batch['features'].values)
-            features_len = batch['features_len'].values
-            labels = pad_to_dense(batch['transcript'].values + 1)
-            label_lengths = batch['transcript_len'].values
-
-            logits, loss_ = session.run([transposed, loss], feed_dict={
-                inputs['input']: features,
-                inputs['input_lengths']: features_len,
-                labels_ph: labels,
-                label_lengths_ph: label_lengths
-            })
+        for batch in bar(range(test_batches)):
+            logits, loss_, lengths, transcripts = session.run([transposed, loss, batch_x_len, batch_y])
 
             logitses.append(logits)
             losses.extend(loss_)
+            seq_lengths.append(lengths)
+            ground_truths.extend(sparse_tensor_value_to_texts(transcripts, Config.alphabet))
 
-    ground_truths = []
     predictions = []
 
-    print('Decoding predictions...')
-    bar = progressbar.ProgressBar(max_value=batch_count,
-                                  widget=progressbar.AdaptiveETA)
-
     # Get number of accessible CPU cores for this process
     try:
         num_processes = cpu_count()
     except:
         num_processes = 1
 
+    print('Decoding predictions...')
+    bar = progressbar.ProgressBar(max_value=test_batches,
+                                  widget=progressbar.AdaptiveETA)
+
     # Second pass, decode logits and compute WER and edit distance metrics
-    for logits, batch in bar(zip(logitses, split_data(test_data, FLAGS.test_batch_size))):
-        seq_lengths = batch['features_len'].values.astype(np.int32)
-        decoded = ctc_beam_search_decoder_batch(logits, seq_lengths, Config.alphabet, FLAGS.beam_width,
+    for logits, seq_length in bar(zip(logitses, seq_lengths)):
+        decoded = ctc_beam_search_decoder_batch(logits, seq_length, Config.alphabet, FLAGS.beam_width,
                                                 num_processes=num_processes, scorer=scorer)
-
-        ground_truths.extend(Config.alphabet.decode(l) for l in batch['transcript'])
         predictions.extend(d[0][1] for d in decoded)
 
     distances = [levenshtein(a, b) for a, b in zip(ground_truths, predictions)]
@@ -179,21 +141,8 @@ def main(_):
                   'the --test_files flag.')
         exit(1)
 
-    # sort examples by length, improves packing of batches and timesteps
-    test_data = preprocess(
-        FLAGS.test_files.split(','),
-        FLAGS.test_batch_size,
-        alphabet=Config.alphabet,
-        numcep=Config.n_input,
-        numcontext=Config.n_context,
-        hdf5_cache_path=FLAGS.hdf5_test_set).sort_values(
-        by="features_len",
-        ascending=False)
-
-    from DeepSpeech import create_inference_graph
-    graph = create_inference_graph(batch_size=FLAGS.test_batch_size, n_steps=-1)
-
-    samples = evaluate(test_data, graph)
+    from DeepSpeech import create_model
+    samples = evaluate(FLAGS.test_files.split(','), create_model)
 
     if FLAGS.test_output_file:
         # Save decoded tuples as JSON, converting NumPy floats to Python floats

diff --git a/native_client/BUILD b/native_client/BUILD
@@ -21,6 +21,14 @@ genrule(
     local = 1,
 )
 
+genrule(
+    name = "ds_graph_version",
+    outs = ["ds_graph_version.h"],
+    cmd = "$(location :ds_graph_version.sh) >$@",
+    tools = [":ds_graph_version.sh"],
+    local = 1,
+)
+
 KENLM_SOURCES = glob(["kenlm/lm/*.cc", "kenlm/util/*.cc", "kenlm/util/double-conversion/*.cc",
                       "kenlm/lm/*.hh", "kenlm/util/*.hh", "kenlm/util/double-conversion/*.h"],
                      exclude = ["kenlm/*/*test.cc", "kenlm/*/*main.cc"])
@@ -62,15 +70,8 @@ tf_cc_shared_object(
     srcs = ["deepspeech.cc",
             "deepspeech.h",
             "alphabet.h",
-            "c_speech_features/c_speech_features.cpp",
-            "kiss_fft130/kiss_fft.c",
-            "kiss_fft130/tools/kiss_fftr.c",
-            "c_speech_features/c_speech_features.h",
-            "c_speech_features/c_speech_features_config.h",
-            "kiss_fft130/kiss_fft.h",
-            "kiss_fft130/_kiss_fft_guts.h",
-            "kiss_fft130/tools/kiss_fftr.h",
-            "ds_version.h"] +
+            "ds_version.h",
+            "ds_graph_version.h"] +
             DECODER_SOURCES,
     copts = select({ 
         # -fvisibility=hidden is not required on Windows, MSVC hides all declarations by default
@@ -119,6 +120,10 @@ tf_cc_shared_object(
             "//tensorflow/core/kernels:control_flow_ops",   # Enter
             "//tensorflow/core/kernels:tile_ops",           # Tile
             "//tensorflow/core/kernels:gather_op",          # Gather
+            "//tensorflow/core/kernels:mfcc_op",            # Mfcc
+            "//tensorflow/core/kernels:spectrogram_op",     # AudioSpectrogram
+            "//tensorflow/core/kernels:strided_slice_op",   # StridedSlice
+            "//tensorflow/core/kernels:slice_op",           # Slice, needed by StridedSlice
             "//tensorflow/contrib/rnn:lstm_ops_kernels",    # BlockLSTM
             "//tensorflow/core/kernels:random_ops",         # RandomGammaGrad
             "//tensorflow/core/kernels:pack_op",            # Pack
@@ -130,7 +135,7 @@ tf_cc_shared_object(
     }) + if_cuda([
             "//tensorflow/core:core",
     ]),
-    includes = ["c_speech_features", "kiss_fft130"] + DECODER_INCLUDES,
+    includes = DECODER_INCLUDES,
     defines = ["KENLM_MAX_ORDER=6"],
 )