Merge pull request #3186 from lissyx/update-r0.8
Update r0.8
lissyx authored Jul 27, 2020
2 parents 32e185f + 3bdb3fc commit 03b5689
Showing 80 changed files with 3,114 additions and 434 deletions.
92 changes: 0 additions & 92 deletions bin/build_sdb.py

This file was deleted.

111 changes: 111 additions & 0 deletions bin/data_set_tool.py
@@ -0,0 +1,111 @@
#!/usr/bin/env python
'''
Tool for building a combined SDB or CSV sample-set from other sets
Use 'python3 data_set_tool.py -h' for help
'''
import sys
import argparse
import progressbar
from pathlib import Path

from deepspeech_training.util.audio import (
    AUDIO_TYPE_PCM,
    AUDIO_TYPE_OPUS,
    AUDIO_TYPE_WAV,
    change_audio_types,
)
from deepspeech_training.util.downloader import SIMPLE_BAR
from deepspeech_training.util.sample_collections import (
    CSVWriter,
    DirectSDBWriter,
    samples_from_sources,
)
from deepspeech_training.util.augmentations import (
    parse_augmentations,
    apply_sample_augmentations,
    SampleAugmentation
)

AUDIO_TYPE_LOOKUP = {'wav': AUDIO_TYPE_WAV, 'opus': AUDIO_TYPE_OPUS}


def build_data_set():
    audio_type = AUDIO_TYPE_LOOKUP[CLI_ARGS.audio_type]
    augmentations = parse_augmentations(CLI_ARGS.augment)
    if any(not isinstance(a, SampleAugmentation) for a in augmentations):
        print('Warning: Some of the specified augmentations will not get applied, as this tool only supports '
              'overlay, codec, reverb, resample and volume.')
    extension = Path(CLI_ARGS.target).suffix.lower()
    labeled = not CLI_ARGS.unlabeled
    if extension == '.csv':
        writer = CSVWriter(CLI_ARGS.target, absolute_paths=CLI_ARGS.absolute_paths, labeled=labeled)
    elif extension == '.sdb':
        writer = DirectSDBWriter(CLI_ARGS.target, audio_type=audio_type, labeled=labeled)
    else:
        print('Unknown extension of target file - has to be either .csv or .sdb')
        sys.exit(1)
    with writer:
        samples = samples_from_sources(CLI_ARGS.sources, labeled=not CLI_ARGS.unlabeled)
        num_samples = len(samples)
        if augmentations:
            samples = apply_sample_augmentations(samples, audio_type=AUDIO_TYPE_PCM, augmentations=augmentations)
        bar = progressbar.ProgressBar(max_value=num_samples, widgets=SIMPLE_BAR)
        for sample in bar(change_audio_types(
                samples,
                audio_type=audio_type,
                bitrate=CLI_ARGS.bitrate,
                processes=CLI_ARGS.workers)):
            writer.add(sample)


def handle_args():
    parser = argparse.ArgumentParser(
        description='Tool for building a combined SDB or CSV sample-set from other sets'
    )
    parser.add_argument(
        'sources',
        nargs='+',
        help='Source CSV and/or SDB files - '
             'Note: For getting a correctly ordered target set, source SDBs have to have their samples '
             'already ordered from shortest to longest.',
    )
    parser.add_argument(
        'target',
        help='SDB or CSV file to create'
    )
    parser.add_argument(
        '--audio-type',
        default='opus',
        choices=AUDIO_TYPE_LOOKUP.keys(),
        help='Audio representation inside target SDB',
    )
    parser.add_argument(
        '--bitrate',
        type=int,
        help='Bitrate for lossy compressed SDB samples like in case of --audio-type opus',
    )
    parser.add_argument(
        '--workers', type=int, default=None, help='Number of encoding SDB workers'
    )
    parser.add_argument(
        '--unlabeled',
        action='store_true',
        help='If to build an SDB with unlabeled (audio only) samples - '
             'typically used for building noise augmentation corpora',
    )
    parser.add_argument(
        '--absolute-paths',
        action='store_true',
        help='If to reference samples by their absolute paths when writing CSV files',
    )
    parser.add_argument(
        '--augment',
        action='append',
        help='Add an augmentation operation',
    )
    return parser.parse_args()


if __name__ == '__main__':
    CLI_ARGS = handle_args()
    build_data_set()
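
For readers who want to drive the same machinery from Python rather than the command line, here is a minimal sketch built only from the helpers imported above. It assumes the ``deepspeech_training`` package from this repository is installed; ``dev1.csv``, ``dev2.csv`` and ``combined.sdb`` are placeholder paths. It mirrors the conversion loop of ``build_data_set()`` without augmentation.

.. code-block:: python

   from deepspeech_training.util.audio import AUDIO_TYPE_OPUS, change_audio_types
   from deepspeech_training.util.sample_collections import DirectSDBWriter, samples_from_sources

   # Placeholder sources - any mix of CSV and SDB files, as with the CLI tool.
   sources = ['dev1.csv', 'dev2.csv']

   # Stream labeled samples from all sources into one SDB, re-encoding the audio
   # as Opus - the same pattern build_data_set() uses above.
   writer = DirectSDBWriter('combined.sdb', audio_type=AUDIO_TYPE_OPUS, labeled=True)
   with writer:
       for sample in change_audio_types(samples_from_sources(sources, labeled=True),
                                        audio_type=AUDIO_TYPE_OPUS):
           writer.add(sample)
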
2 changes: 1 addition & 1 deletion bin/play.py
@@ -1,7 +1,7 @@
#!/usr/bin/env python
"""
Tool for playing (and augmenting) single samples or samples from Sample Databases (SDB files) and DeepSpeech CSV files
-Use "python3 build_sdb.py -h" for help
+Use "python3 play.py -h" for help
"""

import os
2 changes: 1 addition & 1 deletion bin/run-tc-ldc93s1_checkpoint_sdb.sh
@@ -13,7 +13,7 @@ fi;

if [ ! -f "${ldc93s1_dir}/ldc93s1.sdb" ]; then
echo "Converting LDC93S1 example data, saving to ${ldc93s1_sdb}."
-python -u bin/build_sdb.py ${ldc93s1_csv} ${ldc93s1_sdb}
+python -u bin/data_set_tool.py ${ldc93s1_csv} ${ldc93s1_sdb}
fi;

# Force only one visible device because we have a single-sample dataset
2 changes: 1 addition & 1 deletion bin/run-tc-ldc93s1_new_sdb.sh
@@ -16,7 +16,7 @@ fi;

if [ ! -f "${ldc93s1_dir}/ldc93s1.sdb" ]; then
echo "Converting LDC93S1 example data, saving to ${ldc93s1_sdb}."
-python -u bin/build_sdb.py ${ldc93s1_csv} ${ldc93s1_sdb}
+python -u bin/data_set_tool.py ${ldc93s1_csv} ${ldc93s1_sdb}
fi;

# Force only one visible device because we have a single-sample dataset
2 changes: 1 addition & 1 deletion bin/run-tc-ldc93s1_new_sdb_csv.sh
@@ -16,7 +16,7 @@ fi;

if [ ! -f "${ldc93s1_dir}/ldc93s1.sdb" ]; then
echo "Converting LDC93S1 example data, saving to ${ldc93s1_sdb}."
-python -u bin/build_sdb.py ${ldc93s1_csv} ${ldc93s1_sdb}
+python -u bin/data_set_tool.py ${ldc93s1_csv} ${ldc93s1_sdb}
fi;

# Force only one visible device because we have a single-sample dataset
14 changes: 14 additions & 0 deletions doc/BUILDING.rst
@@ -77,6 +77,20 @@ You can now use Bazel to build the main DeepSpeech library, ``libdeepspeech.so``
The generated binaries will be saved to ``bazel-bin/native_client/``.

.. _build-generate-scorer-package:

Compile ``generate_scorer_package``
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Following the same setup as for ``libdeepspeech.so`` above, you can rebuild the ``generate_scorer_package`` binary by adding its target to the command line: ``//native_client:generate_scorer_package``.
Using the example from above you can build the library and that binary at the same time:

.. code-block::

   bazel build --workspace_status_command="bash native_client/bazel_workspace_status_cmd.sh" --config=monolithic -c opt --copt=-O3 --copt="-D_GLIBCXX_USE_CXX11_ABI=0" --copt=-fvisibility=hidden //native_client:libdeepspeech.so //native_client:generate_scorer_package

The generated binaries will be saved to ``bazel-bin/native_client/``.

Compile Language Bindings
^^^^^^^^^^^^^^^^^^^^^^^^^

3 changes: 3 additions & 0 deletions doc/Scorer.rst
@@ -49,6 +49,9 @@ Afterwards you can use ``generate_scorer_package`` to generate the scorer package
   ./generate_scorer_package --alphabet ../alphabet.txt --lm lm.binary --vocab vocab-500000.txt \
     --package kenlm.scorer --default_alpha 0.931289039105002 --default_beta 1.1834137581510284

The ``generate_scorer_package`` binary is part of the released ``native_client.tar.xz``. If for some reason you need to rebuild it,
please refer to how to :ref:`build-generate-scorer-package`.

Building your own scorer
------------------------

15 changes: 13 additions & 2 deletions doc/TRAINING.rst
@@ -124,7 +124,7 @@ It will also add the following ``.csv`` files:
* ``clips/dev.csv``
* ``clips/test.csv``

-All entries in these CSV files refer to their samples by absolute paths. So moving this sub-directory would require another import or tweaking the CSV files accordingly.
+Entries in CSV files can refer to samples by their absolute or relative paths. Here, the importer produces relative paths.

To use Common Voice data during training, validation and testing, you pass (comma separated combinations of) their filenames into ``--train_files``\ , ``--dev_files``\ , ``--test_files`` parameters of ``DeepSpeech.py``.
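
To make the absolute-vs-relative note above concrete, here is a rough sketch of how entries in an import CSV can be resolved against the CSV's own location. The column names (``wav_filename``, ``wav_filesize``, ``transcript``) follow the usual DeepSpeech CSV layout, and ``clips/train.csv`` is just the example file from the list above; treat this as illustrative rather than the importer's actual code.

.. code-block:: python

   import csv
   from pathlib import Path

   csv_path = Path('clips/train.csv')
   with open(csv_path, newline='', encoding='utf-8') as csv_file:
       for row in csv.DictReader(csv_file):
           # A relative wav_filename is taken relative to the CSV's directory;
           # an absolute one passes through the path join unchanged.
           wav_path = csv_path.parent / row['wav_filename']
           print(wav_path, row['transcript'])
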

Expand Down Expand Up @@ -287,6 +287,8 @@ UTF-8 mode
DeepSpeech includes a UTF-8 operating mode which can be useful to model languages with very large alphabets, such as Chinese Mandarin. For details on how it works and how to use it, see :ref:`decoder-docs`.


.. _training-data-augmentation:

Augmentation
^^^^^^^^^^^^

@@ -496,7 +498,7 @@ Example training with all augmentations:
[...]
-The ``bin/play.py`` tool also supports ``--augment`` parameters (for sample domain augmentations) and can be used for experimenting with different configurations.
+The ``bin/play.py`` and ``bin/data_set_tool.py`` tools also support ``--augment`` parameters (for sample domain augmentations) and can be used for experimenting with different configurations or creating augmented data sets.

Example of playing all samples with reverberation and maximized volume:

@@ -510,3 +512,12 @@ Example simulation of the codec augmentation of a wav-file first at the beginning
   bin/play.py --augment codec[p=0.1,bitrate=48000:16000] --clock 0.0 test.wav
   bin/play.py --augment codec[p=0.1,bitrate=48000:16000] --clock 1.0 test.wav

Example of creating a pre-augmented test set:

.. code-block:: bash

   bin/data_set_tool.py \
     --augment overlay[source=noise.sdb,layers=1,snr=20~10] \
     --augment resample[rate=12000:8000~4000] \
     test.sdb test-augmented.sdb
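
A quick way to sanity-check the generated set is to iterate over it with the same helper the tool uses internally. This is a sketch assuming ``test-augmented.sdb`` was produced by the command above, the ``deepspeech_training`` package is installed, and labeled samples expose a ``transcript`` attribute as in the training code:

.. code-block:: python

   from deepspeech_training.util.sample_collections import samples_from_sources

   samples = samples_from_sources(['test-augmented.sdb'], labeled=True)
   print('Number of samples:', len(samples))
   for index, sample in enumerate(samples):
       # Labeled samples carry their transcript; audio stays in the SDB's encoding.
       print(sample.transcript)
       if index >= 4:
           break
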
1 change: 1 addition & 0 deletions native_client/BUILD
@@ -130,6 +130,7 @@ tf_cc_shared_object(
    }) + tflite_copts(),
    linkopts = select({
        "//tensorflow:macos": [],
+       "//tensorflow:ios": ["-fembed-bitcode"],
        "//tensorflow:linux_x86_64": LINUX_LINKOPTS,
        "//tensorflow:rpi3": LINUX_LINKOPTS,
        "//tensorflow:rpi3-armv8": LINUX_LINKOPTS,
10 changes: 7 additions & 3 deletions native_client/client.cc
@@ -12,7 +12,11 @@
#include <sstream>
#include <string>

-#if defined(__ANDROID__) || defined(_MSC_VER)
+#ifdef __APPLE__
+#include <TargetConditionals.h>
+#endif
+
+#if defined(__ANDROID__) || defined(_MSC_VER) || TARGET_OS_IPHONE
#define NO_SOX
#endif

Expand Down Expand Up @@ -244,7 +248,7 @@ GetAudioBuffer(const char* path, int desired_sample_rate)
sox_false // Reverse endianness
};

-#ifdef __APPLE__
+#if TARGET_OS_OSX
// It would be preferable to use sox_open_memstream_write here, but OS-X
// doesn't support POSIX 2008, which it requires. See Issue #461.
// Instead, we write to a temporary file.
@@ -348,7 +352,7 @@ GetAudioBuffer(const char* path, int desired_sample_rate)
fclose(wave);
#endif // NO_SOX

-#ifdef __APPLE__
+#if TARGET_OS_OSX
res.buffer_size = (size_t)(output->olength * 2);
res.buffer = (char*)malloc(sizeof(char) * res.buffer_size);
FILE* output_file = fopen(output_name, "rb");
19 changes: 19 additions & 0 deletions native_client/ctcdecode/__init__.py
@@ -48,15 +48,33 @@ def __init__(self, config_path):
            raise ValueError('Alphabet initialization failed with error code 0x{:X}'.format(err))

    def CanEncodeSingle(self, input):
        '''
        Returns true if the single character/output class has a corresponding label
        in the alphabet.
        '''
        return super(Alphabet, self).CanEncodeSingle(input.encode('utf-8'))

    def CanEncode(self, input):
        '''
        Returns true if the entire string can be encoded into labels in this
        alphabet.
        '''
        return super(Alphabet, self).CanEncode(input.encode('utf-8'))

    def EncodeSingle(self, input):
        '''
        Encode a single character/output class into a label. Character must be in
        the alphabet, this method will assert that. Use `CanEncodeSingle` to test.
        '''
        return super(Alphabet, self).EncodeSingle(input.encode('utf-8'))

    def Encode(self, input):
        '''
        Encode a sequence of character/output classes into a sequence of labels.
        Characters are assumed to always take a single Unicode codepoint.
        Characters must be in the alphabet, this method will assert that. Use
        `CanEncode` and `CanEncodeSingle` to test.
        '''
        # Convert SWIG's UnsignedIntVec to a Python list
        res = super(Alphabet, self).Encode(input.encode('utf-8'))
        return [el for el in res]
@@ -66,6 +84,7 @@ def DecodeSingle(self, input):
        return res.decode('utf-8')

    def Decode(self, input):
        '''Decode a sequence of labels into a string.'''
        res = super(Alphabet, self).Decode(input)
        return res.decode('utf-8')
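
For context, here is a hedged usage sketch of the methods documented above, assuming the ``ds_ctcdecoder`` package built from this directory is installed and ``alphabet.txt`` is a placeholder alphabet file (one character per line):

.. code-block:: python

   from ds_ctcdecoder import Alphabet

   alphabet = Alphabet('alphabet.txt')

   text = 'hello'
   if alphabet.CanEncode(text):
       labels = alphabet.Encode(text)   # one integer label per character
       print(labels)

   # Single characters/output classes can be checked and encoded individually.
   if alphabet.CanEncodeSingle('h'):
       print(alphabet.EncodeSingle('h'))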
