Merge pull request #3186 from lissyx/update-r0.8
Update r0.8
lissyx authored Jul 27, 2020
2 parents 32e185f + 3bdb3fc commit 03b5689
Showing 80 changed files with 3,114 additions and 434 deletions.
92 changes: 0 additions & 92 deletions bin/build_sdb.py

This file was deleted.

111 changes: 111 additions & 0 deletions bin/data_set_tool.py
@@ -0,0 +1,111 @@
#!/usr/bin/env python
'''
Tool for building a combined SDB or CSV sample-set from other sets
Use 'python3 data_set_tool.py -h' for help
'''
import sys
import argparse
import progressbar
from pathlib import Path

from deepspeech_training.util.audio import (
    AUDIO_TYPE_PCM,
    AUDIO_TYPE_OPUS,
    AUDIO_TYPE_WAV,
    change_audio_types,
)
from deepspeech_training.util.downloader import SIMPLE_BAR
from deepspeech_training.util.sample_collections import (
    CSVWriter,
    DirectSDBWriter,
    samples_from_sources,
)
from deepspeech_training.util.augmentations import (
    parse_augmentations,
    apply_sample_augmentations,
    SampleAugmentation
)

AUDIO_TYPE_LOOKUP = {'wav': AUDIO_TYPE_WAV, 'opus': AUDIO_TYPE_OPUS}


def build_data_set():
    audio_type = AUDIO_TYPE_LOOKUP[CLI_ARGS.audio_type]
    augmentations = parse_augmentations(CLI_ARGS.augment)
    if any(not isinstance(a, SampleAugmentation) for a in augmentations):
        print('Warning: Some of the specified augmentations will not get applied, as this tool only supports '
              'overlay, codec, reverb, resample and volume.')
    extension = Path(CLI_ARGS.target).suffix.lower()
    labeled = not CLI_ARGS.unlabeled
    if extension == '.csv':
        writer = CSVWriter(CLI_ARGS.target, absolute_paths=CLI_ARGS.absolute_paths, labeled=labeled)
    elif extension == '.sdb':
        writer = DirectSDBWriter(CLI_ARGS.target, audio_type=audio_type, labeled=labeled)
    else:
        print('Unknown extension of target file - has to be either .csv or .sdb')
        sys.exit(1)
    with writer:
        samples = samples_from_sources(CLI_ARGS.sources, labeled=not CLI_ARGS.unlabeled)
        num_samples = len(samples)
        if augmentations:
            samples = apply_sample_augmentations(samples, audio_type=AUDIO_TYPE_PCM, augmentations=augmentations)
        bar = progressbar.ProgressBar(max_value=num_samples, widgets=SIMPLE_BAR)
        for sample in bar(change_audio_types(
                samples,
                audio_type=audio_type,
                bitrate=CLI_ARGS.bitrate,
                processes=CLI_ARGS.workers)):
            writer.add(sample)


def handle_args():
    parser = argparse.ArgumentParser(
        description='Tool for building a combined SDB or CSV sample-set from other sets'
    )
    parser.add_argument(
        'sources',
        nargs='+',
        help='Source CSV and/or SDB files - '
             'Note: For getting a correctly ordered target set, source SDBs have to have their samples '
             'already ordered from shortest to longest.',
    )
    parser.add_argument(
        'target',
        help='SDB or CSV file to create'
    )
    parser.add_argument(
        '--audio-type',
        default='opus',
        choices=AUDIO_TYPE_LOOKUP.keys(),
        help='Audio representation inside target SDB',
    )
    parser.add_argument(
        '--bitrate',
        type=int,
        help='Bitrate for lossy compressed SDB samples like in case of --audio-type opus',
    )
    parser.add_argument(
        '--workers', type=int, default=None, help='Number of encoding SDB workers'
    )
    parser.add_argument(
        '--unlabeled',
        action='store_true',
        help='If to build an SDB with unlabeled (audio only) samples - '
             'typically used for building noise augmentation corpora',
    )
    parser.add_argument(
        '--absolute-paths',
        action='store_true',
        help='If to reference samples by their absolute paths when writing CSV files',
    )
    parser.add_argument(
        '--augment',
        action='append',
        help='Add an augmentation operation',
    )
    return parser.parse_args()


if __name__ == '__main__':
    CLI_ARGS = handle_args()
    build_data_set()
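
For readers who want to drive the same machinery from Python rather than the command line, here is a minimal sketch built only from the helpers imported above. It assumes the ``deepspeech_training`` package from this repository is installed; ``dev1.csv``, ``dev2.csv`` and ``combined.sdb`` are placeholder paths. It mirrors the conversion loop of ``build_data_set()`` without augmentation.

.. code-block:: python

   from deepspeech_training.util.audio import AUDIO_TYPE_OPUS, change_audio_types
   from deepspeech_training.util.sample_collections import DirectSDBWriter, samples_from_sources

   # Placeholder sources - any mix of CSV and SDB files, as with the CLI tool.
   sources = ['dev1.csv', 'dev2.csv']

   # Stream labeled samples from all sources into one SDB, re-encoding the audio
   # as Opus - the same pattern build_data_set() uses above.
   writer = DirectSDBWriter('combined.sdb', audio_type=AUDIO_TYPE_OPUS, labeled=True)
   with writer:
       for sample in change_audio_types(samples_from_sources(sources, labeled=True),
                                        audio_type=AUDIO_TYPE_OPUS):
           writer.add(sample)
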
2 changes: 1 addition & 1 deletion bin/play.py
@@ -1,7 +1,7 @@
#!/usr/bin/env python
"""
Tool for playing (and augmenting) single samples or samples from Sample Databases (SDB files) and DeepSpeech CSV files
-Use "python3 build_sdb.py -h" for help
+Use "python3 play.py -h" for help
"""

import os
2 changes: 1 addition & 1 deletion bin/run-tc-ldc93s1_checkpoint_sdb.sh
@@ -13,7 +13,7 @@ fi;

if [ ! -f "${ldc93s1_dir}/ldc93s1.sdb" ]; then
echo "Converting LDC93S1 example data, saving to ${ldc93s1_sdb}."
-python -u bin/build_sdb.py ${ldc93s1_csv} ${ldc93s1_sdb}
+python -u bin/data_set_tool.py ${ldc93s1_csv} ${ldc93s1_sdb}
fi;

# Force only one visible device because we have a single-sample dataset
2 changes: 1 addition & 1 deletion bin/run-tc-ldc93s1_new_sdb.sh
@@ -16,7 +16,7 @@ fi;

if [ ! -f "${ldc93s1_dir}/ldc93s1.sdb" ]; then
echo "Converting LDC93S1 example data, saving to ${ldc93s1_sdb}."
-python -u bin/build_sdb.py ${ldc93s1_csv} ${ldc93s1_sdb}
+python -u bin/data_set_tool.py ${ldc93s1_csv} ${ldc93s1_sdb}
fi;

# Force only one visible device because we have a single-sample dataset
2 changes: 1 addition & 1 deletion bin/run-tc-ldc93s1_new_sdb_csv.sh
@@ -16,7 +16,7 @@ fi;

if [ ! -f "${ldc93s1_dir}/ldc93s1.sdb" ]; then
echo "Converting LDC93S1 example data, saving to ${ldc93s1_sdb}."
-python -u bin/build_sdb.py ${ldc93s1_csv} ${ldc93s1_sdb}
+python -u bin/data_set_tool.py ${ldc93s1_csv} ${ldc93s1_sdb}
fi;

# Force only one visible device because we have a single-sample dataset
14 changes: 14 additions & 0 deletions doc/BUILDING.rst
@@ -77,6 +77,20 @@ You can now use Bazel to build the main DeepSpeech library, ``libdeepspeech.so``
The generated binaries will be saved to ``bazel-bin/native_client/``.

.. _build-generate-scorer-package:

Compile ``generate_scorer_package``
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Following the same setup as for ``libdeepspeech.so`` above, you can rebuild the ``generate_scorer_package`` binary by adding its target to the command line: ``//native_client:generate_scorer_package``.
Using the example from above you can build the library and that binary at the same time:

.. code-block::

   bazel build --workspace_status_command="bash native_client/bazel_workspace_status_cmd.sh" --config=monolithic -c opt --copt=-O3 --copt="-D_GLIBCXX_USE_CXX11_ABI=0" --copt=-fvisibility=hidden //native_client:libdeepspeech.so //native_client:generate_scorer_package

The generated binaries will be saved to ``bazel-bin/native_client/``.

Compile Language Bindings
^^^^^^^^^^^^^^^^^^^^^^^^^

3 changes: 3 additions & 0 deletions doc/Scorer.rst
@@ -49,6 +49,9 @@ Afterwards you can use ``generate_scorer_package`` to generate the scorer package
   ./generate_scorer_package --alphabet ../alphabet.txt --lm lm.binary --vocab vocab-500000.txt \
     --package kenlm.scorer --default_alpha 0.931289039105002 --default_beta 1.1834137581510284

The ``generate_scorer_package`` binary is part of the released ``native_client.tar.xz``. If for some reason you need to rebuild it,
please refer to how to :ref:`build-generate-scorer-package`.

Building your own scorer
------------------------

15 changes: 13 additions & 2 deletions doc/TRAINING.rst
@@ -124,7 +124,7 @@ It will also add the following ``.csv`` files:
* ``clips/dev.csv``
* ``clips/test.csv``

-All entries in these CSV files refer to their samples by absolute paths. So moving this sub-directory would require another import or tweaking the CSV files accordingly.
+Entries in CSV files can refer to samples by their absolute or relative paths. Here, the importer produces relative paths.

To use Common Voice data during training, validation and testing, you pass (comma separated combinations of) their filenames into ``--train_files``\ , ``--dev_files``\ , ``--test_files`` parameters of ``DeepSpeech.py``.
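
To make the absolute-vs-relative note above concrete, here is a rough sketch of how entries in an import CSV can be resolved against the CSV's own location. The column names (``wav_filename``, ``wav_filesize``, ``transcript``) follow the usual DeepSpeech CSV layout, and ``clips/train.csv`` is just the example file from the list above; treat this as illustrative rather than the importer's actual code.

.. code-block:: python

   import csv
   from pathlib import Path

   csv_path = Path('clips/train.csv')
   with open(csv_path, newline='', encoding='utf-8') as csv_file:
       for row in csv.DictReader(csv_file):
           # A relative wav_filename is taken relative to the CSV's directory;
           # an absolute one passes through the path join unchanged.
           wav_path = csv_path.parent / row['wav_filename']
           print(wav_path, row['transcript'])
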

Expand Down Expand Up @@ -287,6 +287,8 @@ UTF-8 mode
DeepSpeech includes a UTF-8 operating mode which can be useful to model languages with very large alphabets, such as Chinese Mandarin. For details on how it works and how to use it, see :ref:`decoder-docs`.


.. _training-data-augmentation:

Augmentation
^^^^^^^^^^^^

@@ -496,7 +498,7 @@ Example training with all augmentations:
[...]
-The ``bin/play.py`` tool also supports ``--augment`` parameters (for sample domain augmentations) and can be used for experimenting with different configurations.
+The ``bin/play.py`` and ``bin/data_set_tool.py`` tools also support ``--augment`` parameters (for sample domain augmentations) and can be used for experimenting with different configurations or creating augmented data sets.

Example of playing all samples with reverberation and maximized volume:

@@ -510,3 +512,12 @@ Example simulation of the codec augmentation of a wav-file first at the beginning
   bin/play.py --augment codec[p=0.1,bitrate=48000:16000] --clock 0.0 test.wav
   bin/play.py --augment codec[p=0.1,bitrate=48000:16000] --clock 1.0 test.wav

Example of creating a pre-augmented test set:

.. code-block:: bash

   bin/data_set_tool.py \
     --augment overlay[source=noise.sdb,layers=1,snr=20~10] \
     --augment resample[rate=12000:8000~4000] \
     test.sdb test-augmented.sdb
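
A quick way to sanity-check the generated set is to iterate over it with the same helper the tool uses internally. This is a sketch assuming ``test-augmented.sdb`` was produced by the command above, the ``deepspeech_training`` package is installed, and labeled samples expose a ``transcript`` attribute as in the training code:

.. code-block:: python

   from deepspeech_training.util.sample_collections import samples_from_sources

   samples = samples_from_sources(['test-augmented.sdb'], labeled=True)
   print('Number of samples:', len(samples))
   for index, sample in enumerate(samples):
       # Labeled samples carry their transcript; audio stays in the SDB's encoding.
       print(sample.transcript)
       if index >= 4:
           break
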
1 change: 1 addition & 0 deletions native_client/BUILD
@@ -130,6 +130,7 @@ tf_cc_shared_object(
    }) + tflite_copts(),
    linkopts = select({
        "//tensorflow:macos": [],
+       "//tensorflow:ios": ["-fembed-bitcode"],
        "//tensorflow:linux_x86_64": LINUX_LINKOPTS,
        "//tensorflow:rpi3": LINUX_LINKOPTS,
        "//tensorflow:rpi3-armv8": LINUX_LINKOPTS,
10 changes: 7 additions & 3 deletions native_client/client.cc
@@ -12,7 +12,11 @@
#include <sstream>
#include <string>

-#if defined(__ANDROID__) || defined(_MSC_VER)
+#ifdef __APPLE__
+#include <TargetConditionals.h>
+#endif
+
+#if defined(__ANDROID__) || defined(_MSC_VER) || TARGET_OS_IPHONE
#define NO_SOX
#endif

Expand Down Expand Up @@ -244,7 +248,7 @@ GetAudioBuffer(const char* path, int desired_sample_rate)
sox_false // Reverse endianness
};

-#ifdef __APPLE__
+#if TARGET_OS_OSX
// It would be preferable to use sox_open_memstream_write here, but OS-X
// doesn't support POSIX 2008, which it requires. See Issue #461.
// Instead, we write to a temporary file.
@@ -348,7 +352,7 @@ GetAudioBuffer(const char* path, int desired_sample_rate)
fclose(wave);
#endif // NO_SOX

-#ifdef __APPLE__
+#if TARGET_OS_OSX
res.buffer_size = (size_t)(output->olength * 2);
res.buffer = (char*)malloc(sizeof(char) * res.buffer_size);
FILE* output_file = fopen(output_name, "rb");
19 changes: 19 additions & 0 deletions native_client/ctcdecode/__init__.py
@@ -48,15 +48,33 @@ def __init__(self, config_path):
            raise ValueError('Alphabet initialization failed with error code 0x{:X}'.format(err))

    def CanEncodeSingle(self, input):
        '''
        Returns true if the single character/output class has a corresponding label
        in the alphabet.
        '''
        return super(Alphabet, self).CanEncodeSingle(input.encode('utf-8'))

    def CanEncode(self, input):
        '''
        Returns true if the entire string can be encoded into labels in this
        alphabet.
        '''
        return super(Alphabet, self).CanEncode(input.encode('utf-8'))

    def EncodeSingle(self, input):
        '''
        Encode a single character/output class into a label. Character must be in
        the alphabet, this method will assert that. Use `CanEncodeSingle` to test.
        '''
        return super(Alphabet, self).EncodeSingle(input.encode('utf-8'))

    def Encode(self, input):
        '''
        Encode a sequence of character/output classes into a sequence of labels.
        Characters are assumed to always take a single Unicode codepoint.
        Characters must be in the alphabet, this method will assert that. Use
        `CanEncode` and `CanEncodeSingle` to test.
        '''
        # Convert SWIG's UnsignedIntVec to a Python list
        res = super(Alphabet, self).Encode(input.encode('utf-8'))
        return [el for el in res]
@@ -66,6 +84,7 @@ def DecodeSingle(self, input):
        return res.decode('utf-8')

    def Decode(self, input):
        '''Decode a sequence of labels into a string.'''
        res = super(Alphabet, self).Decode(input)
        return res.decode('utf-8')
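
For context, here is a hedged usage sketch of the methods documented above, assuming the ``ds_ctcdecoder`` package built from this directory is installed and ``alphabet.txt`` is a placeholder alphabet file (one character per line):

.. code-block:: python

   from ds_ctcdecoder import Alphabet

   alphabet = Alphabet('alphabet.txt')

   text = 'hello'
   if alphabet.CanEncode(text):
       labels = alphabet.Encode(text)   # one integer label per character
       print(labels)

   # Single characters/output classes can be checked and encoded individually.
   if alphabet.CanEncodeSingle('h'):
       print(alphabet.EncodeSingle('h'))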
