From 03ca94887c62cff6de755cd86d5b6c76c83fe1a0 Mon Sep 17 00:00:00 2001 From: Reuben Morais Date: Fri, 26 Jun 2020 10:23:46 +0200 Subject: [PATCH 1/9] Move DS_ErrorCodeToErrorMessage impl to its own object so it can be used without including all of libdeepspeech --- native_client/BUILD | 7 ++++--- native_client/deepspeech.cc | 17 ----------------- native_client/deepspeech_errors.cc | 19 +++++++++++++++++++ 3 files changed, 23 insertions(+), 20 deletions(-) create mode 100644 native_client/deepspeech_errors.cc diff --git a/native_client/BUILD b/native_client/BUILD index 53711dc2a6..965a766cec 100644 --- a/native_client/BUILD +++ b/native_client/BUILD @@ -89,13 +89,14 @@ cc_library( tf_cc_shared_object( name = "libdeepspeech.so", srcs = [ + "alphabet.h", "deepspeech.cc", "deepspeech.h", - "alphabet.h", - "modelstate.h", + "deepspeech_errors.cc", "modelstate.cc", - "workspace_status.h", + "modelstate.h", "workspace_status.cc", + "workspace_status.h", ] + select({ "//native_client:tflite": [ "tflitemodelstate.h", diff --git a/native_client/deepspeech.cc b/native_client/deepspeech.cc index 3bcecc602c..38868d4b5f 100644 --- a/native_client/deepspeech.cc +++ b/native_client/deepspeech.cc @@ -501,20 +501,3 @@ DS_Version() { return strdup(ds_version()); } - -char* -DS_ErrorCodeToErrorMessage(int aErrorCode) -{ -#define RETURN_MESSAGE(NAME, VALUE, DESC) \ - case NAME: \ - return strdup(DESC); - - switch(aErrorCode) - { - DS_FOR_EACH_ERROR(RETURN_MESSAGE) - default: - return strdup("Unknown error, please make sure you are using the correct native binary."); - } - -#undef RETURN_MESSAGE -} diff --git a/native_client/deepspeech_errors.cc b/native_client/deepspeech_errors.cc new file mode 100644 index 0000000000..1f1e4d8d15 --- /dev/null +++ b/native_client/deepspeech_errors.cc @@ -0,0 +1,19 @@ +#include "deepspeech.h" +#include + +char* +DS_ErrorCodeToErrorMessage(int aErrorCode) +{ +#define RETURN_MESSAGE(NAME, VALUE, DESC) \ + case NAME: \ + return strdup(DESC); + + 
switch(aErrorCode) + { + DS_FOR_EACH_ERROR(RETURN_MESSAGE) + default: + return strdup("Unknown error, please make sure you are using the correct native binary."); + } + +#undef RETURN_MESSAGE +} From f82c77392de3e6dbf8c28471adb7db3b6ab83937 Mon Sep 17 00:00:00 2001 From: Reuben Morais Date: Fri, 26 Jun 2020 10:27:35 +0200 Subject: [PATCH 2/9] Rewrite data/lm/generate_package.py into native_client/generate_scorer_package.cpp --- data/lm/generate_package.py | 157 ---------------------- native_client/BUILD | 19 +++ native_client/alphabet.h | 45 ++++++- native_client/ctcdecode/scorer.cpp | 2 +- native_client/ctcdecode/scorer.h | 3 +- native_client/generate_scorer_package.cpp | 146 ++++++++++++++++++++ 6 files changed, 211 insertions(+), 161 deletions(-) delete mode 100644 data/lm/generate_package.py create mode 100644 native_client/generate_scorer_package.cpp diff --git a/data/lm/generate_package.py b/data/lm/generate_package.py deleted file mode 100644 index 30a33fcc7e..0000000000 --- a/data/lm/generate_package.py +++ /dev/null @@ -1,157 +0,0 @@ -#!/usr/bin/env python -from __future__ import absolute_import, division, print_function - -import argparse -import shutil -import sys - -import ds_ctcdecoder -from deepspeech_training.util.text import Alphabet, UTF8Alphabet -from ds_ctcdecoder import Scorer, Alphabet as NativeAlphabet - - -def create_bundle( - alphabet_path, - lm_path, - vocab_path, - package_path, - force_utf8, - default_alpha, - default_beta, -): - words = set() - vocab_looks_char_based = True - with open(vocab_path) as fin: - for line in fin: - for word in line.split(): - words.add(word.encode("utf-8")) - if len(word) > 1: - vocab_looks_char_based = False - print("{} unique words read from vocabulary file.".format(len(words))) - - cbm = "Looks" if vocab_looks_char_based else "Doesn't look" - print("{} like a character based model.".format(cbm)) - - if force_utf8 != None: # pylint: disable=singleton-comparison - use_utf8 = force_utf8.value - else: - 
use_utf8 = vocab_looks_char_based - print("Using detected UTF-8 mode: {}".format(use_utf8)) - - if use_utf8: - serialized_alphabet = UTF8Alphabet().serialize() - else: - if not alphabet_path: - raise RuntimeError("No --alphabet path specified, can't continue.") - serialized_alphabet = Alphabet(alphabet_path).serialize() - - alphabet = NativeAlphabet() - err = alphabet.deserialize(serialized_alphabet, len(serialized_alphabet)) - if err != 0: - raise RuntimeError("Error loading alphabet: {}".format(err)) - - scorer = Scorer() - scorer.set_alphabet(alphabet) - scorer.set_utf8_mode(use_utf8) - scorer.reset_params(default_alpha, default_beta) - err = scorer.load_lm(lm_path) - if err != ds_ctcdecoder.DS_ERR_SCORER_NO_TRIE: - print('Error loading language model file: 0x{:X}.'.format(err)) - print('See the error codes section in https://deepspeech.readthedocs.io for a description.') - sys.exit(1) - scorer.fill_dictionary(list(words)) - shutil.copy(lm_path, package_path) - # append, not overwrite - if scorer.save_dictionary(package_path, True): - print("Package created in {}".format(package_path)) - else: - print("Error when creating {}".format(package_path)) - sys.exit(1) - - -class Tristate(object): - def __init__(self, value=None): - if any(value is v for v in (True, False, None)): - self.value = value - else: - raise ValueError("Tristate value must be True, False, or None") - - def __eq__(self, other): - return ( - self.value is other.value - if isinstance(other, Tristate) - else self.value is other - ) - - def __ne__(self, other): - return not self == other - - def __bool__(self): - raise TypeError("Tristate object may not be used as a Boolean") - - def __str__(self): - return str(self.value) - - def __repr__(self): - return "Tristate(%s)" % self.value - - -def main(): - parser = argparse.ArgumentParser( - description="Generate an external scorer package for DeepSpeech." 
- ) - parser.add_argument( - "--alphabet", - help="Path of alphabet file to use for vocabulary construction. Words with characters not in the alphabet will not be included in the vocabulary. Optional if using UTF-8 mode.", - ) - parser.add_argument( - "--lm", - required=True, - help="Path of KenLM binary LM file. Must be built without including the vocabulary (use the -v flag). See generate_lm.py for how to create a binary LM.", - ) - parser.add_argument( - "--vocab", - required=True, - help="Path of vocabulary file. Must contain words separated by whitespace.", - ) - parser.add_argument("--package", required=True, help="Path to save scorer package.") - parser.add_argument( - "--default_alpha", - type=float, - required=True, - help="Default value of alpha hyperparameter.", - ) - parser.add_argument( - "--default_beta", - type=float, - required=True, - help="Default value of beta hyperparameter.", - ) - parser.add_argument( - "--force_utf8", - type=str, - default="", - help="Boolean flag, force set or unset UTF-8 mode in the scorer package. If not set, infers from the vocabulary. 
See for further explanation", - ) - args = parser.parse_args() - - if args.force_utf8 in ("True", "1", "true", "yes", "y"): - force_utf8 = Tristate(True) - elif args.force_utf8 in ("False", "0", "false", "no", "n"): - force_utf8 = Tristate(False) - else: - force_utf8 = Tristate(None) - - create_bundle( - args.alphabet, - args.lm, - args.vocab, - args.package, - force_utf8, - args.default_alpha, - args.default_beta, - ) - - -if __name__ == "__main__": - main() diff --git a/native_client/BUILD b/native_client/BUILD index 965a766cec..36702088d0 100644 --- a/native_client/BUILD +++ b/native_client/BUILD @@ -2,6 +2,7 @@ load("@org_tensorflow//tensorflow:tensorflow.bzl", "tf_cc_shared_object") load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda") +load("@com_github_nelhage_rules_boost//:boost/boost.bzl", "boost_deps") load( "@org_tensorflow//tensorflow/lite:build_def.bzl", @@ -78,6 +79,8 @@ cc_library( hdrs = [ "ctcdecode/ctc_beam_search_decoder.h", "ctcdecode/scorer.h", + "ctcdecode/decoder_utils.h", + "alphabet.h", ], includes = [ ".", @@ -186,6 +189,22 @@ genrule( cmd = "dsymutil $(location :libdeepspeech.so) -o $@" ) +cc_binary( + name = "generate_scorer_package", + srcs = [ + "generate_scorer_package.cpp", + "deepspeech_errors.cc", + ], + copts = ["-std=c++11"], + deps = [ + ":decoder", + "@com_google_absl//absl/flags:flag", + "@com_google_absl//absl/flags:parse", + "@com_google_absl//absl/types:optional", + "@boost//:program_options", + ], +) + cc_binary( name = "enumerate_kenlm_vocabulary", srcs = [ diff --git a/native_client/alphabet.h b/native_client/alphabet.h index ace905ccde..e57ef91446 100644 --- a/native_client/alphabet.h +++ b/native_client/alphabet.h @@ -19,7 +19,7 @@ class Alphabet { Alphabet(const Alphabet&) = default; Alphabet& operator=(const Alphabet&) = default; - int init(const char *config_file) { + virtual int init(const char *config_file) { std::ifstream in(config_file, std::ios::in); if (!in) { return 1; @@ -45,6 +45,30 @@ class Alphabet { 
return 0; } + std::string serialize() { + // Serialization format is a sequence of (key, value) pairs, where key is + // a uint16_t and value is a uint16_t length followed by `length` UTF-8 + // encoded bytes with the label. + std::stringstream out; + + // We start by writing the number of pairs in the buffer as uint16_t. + uint16_t size = size_; + out.write(reinterpret_cast(&size), sizeof(size)); + + for (auto it = label_to_str_.begin(); it != label_to_str_.end(); ++it) { + uint16_t key = it->first; + string str = it->second; + uint16_t len = str.length(); + // Then we write the key as uint16_t, followed by the length of the value + // as uint16_t, followed by `length` bytes (the value itself). + out.write(reinterpret_cast(&key), sizeof(key)); + out.write(reinterpret_cast(&len), sizeof(len)); + out.write(str.data(), len); + } + + return out.str(); + } + int deserialize(const char* buffer, const int buffer_size) { // See util/text.py for an explanation of the serialization format. int offset = 0; @@ -126,11 +150,28 @@ class Alphabet { return word; } -private: +protected: size_t size_; unsigned int space_label_; std::unordered_map label_to_str_; std::unordered_map str_to_label_; }; +class UTF8Alphabet : public Alphabet +{ +public: + UTF8Alphabet() { + size_ = 255; + space_label_ = ' ' - 1; + for (int i = 0; i < size_; ++i) { + std::string val(1, i+1); + label_to_str_[i] = val; + str_to_label_[val] = i; + } + } + + int init(const char*) override {} +}; + + #endif //ALPHABET_H diff --git a/native_client/ctcdecode/scorer.cpp b/native_client/ctcdecode/scorer.cpp index ebf5522763..401613d131 100644 --- a/native_client/ctcdecode/scorer.cpp +++ b/native_client/ctcdecode/scorer.cpp @@ -357,7 +357,7 @@ std::vector Scorer::make_ngram(PathTrie* prefix) return ngram; } -void Scorer::fill_dictionary(const std::vector& vocabulary) +void Scorer::fill_dictionary(const std::unordered_set& vocabulary) { // ConstFst is immutable, so we need to use a MutableFst to create the trie, // 
and then we convert to a ConstFst for the decoder and for storing on disk. diff --git a/native_client/ctcdecode/scorer.h b/native_client/ctcdecode/scorer.h index d2a1c8b3be..3e7c076102 100644 --- a/native_client/ctcdecode/scorer.h +++ b/native_client/ctcdecode/scorer.h @@ -4,6 +4,7 @@ #include #include #include +#include #include #include "lm/virtual_interface.hh" @@ -83,7 +84,7 @@ class Scorer { bool is_scoring_boundary(PathTrie* prefix, size_t new_label); // fill dictionary FST from a vocabulary - void fill_dictionary(const std::vector &vocabulary); + void fill_dictionary(const std::unordered_set &vocabulary); // load language model from given path int load_lm(const std::string &lm_path); diff --git a/native_client/generate_scorer_package.cpp b/native_client/generate_scorer_package.cpp new file mode 100644 index 0000000000..910bf9c234 --- /dev/null +++ b/native_client/generate_scorer_package.cpp @@ -0,0 +1,146 @@ +#include +#include +#include +#include +#include +using namespace std; + +#include "absl/types/optional.h" +#include "boost/program_options.hpp" + +#include "ctcdecode/decoder_utils.h" +#include "ctcdecode/scorer.h" +#include "alphabet.h" +#include "deepspeech.h" + +namespace po = boost::program_options; + +int +create_package(absl::optional alphabet_path, + string lm_path, + string vocab_path, + string package_path, + absl::optional force_utf8, + float default_alpha, + float default_beta) +{ + // Read vocabulary + unordered_set words; + bool vocab_looks_char_based = true; + ifstream fin(vocab_path); + if (!fin) { + cerr << "Invalid vocabulary file " << vocab_path << "\n"; + return 1; + } + string word; + while (fin >> word) { + words.insert(word); + if (get_utf8_str_len(word) > 1) { + vocab_looks_char_based = false; + } + } + cerr << words.size() << " unique words read from vocabulary file.\n" + << (vocab_looks_char_based ? 
"Looks" : "Doesn't look") + << " like a character based (Bytes Are All You Need) model.\n"; + + if (!force_utf8.has_value()) { + force_utf8 = vocab_looks_char_based; + cerr << "--force_utf8 was not specified, using value " + << "infered from vocabulary contents: " + << (vocab_looks_char_based ? "true" : "false") << "\n"; + } + + if (force_utf8.value() && !alphabet_path.has_value()) { + cerr << "No --alphabet file specified, not using bytes output mode, can't continue.\n"; + return 1; + } + + Scorer scorer; + if (force_utf8.value()) { + scorer.set_alphabet(UTF8Alphabet()); + } else { + Alphabet alphabet; + alphabet.init(alphabet_path->c_str()); + scorer.set_alphabet(alphabet); + } + scorer.set_utf8_mode(force_utf8.value()); + scorer.reset_params(default_alpha, default_beta); + int err = scorer.load_lm(lm_path); + if (err != DS_ERR_SCORER_NO_TRIE) { + cerr << "Error loading language model file: " + << DS_ErrorCodeToErrorMessage(err) << "\n"; + return 1; + } + scorer.fill_dictionary(words); + + // Copy LM file to final package file destination + { + ifstream lm_src(lm_path, std::ios::binary); + ofstream package_dest(package_path, std::ios::binary); + package_dest << lm_src.rdbuf(); + } + + // Save dictionary to package file, appending instead of overwriting + if (!scorer.save_dictionary(package_path, true)) { + cerr << "Error when saving package in " << package_path << ".\n"; + return 1; + } + + cerr << "Package created in " << package_path << ".\n"; + return 0; +} + +int +main(int argc, char** argv) +{ + po::options_description desc("Options"); + desc.add_options() + ("help", "show help message") + ("alphabet", po::value(), "Path of alphabet file to use for vocabulary construction. Words with characters not in the alphabet will not be included in the vocabulary. Optional if using UTF-8 mode.") + ("lm", po::value(), "Path of KenLM binary LM file. Must be built without including the vocabulary (use the -v flag). 
See generate_lm.py for how to create a binary LM.") + ("vocab", po::value(), "Path of vocabulary file. Must contain words separated by whitespace.") + ("package", po::value(), "Path to save scorer package.") + ("default_alpha", po::value(), "Default value of alpha hyperparameter (float).") + ("default_beta", po::value(), "Default value of beta hyperparameter (float).") + ("force_utf8", po::value(), "Boolean flag, force set or unset UTF-8 mode in the scorer package. If not set, infers from the vocabulary. See for further explanation.") + ; + + po::variables_map vm; + po::store(po::parse_command_line(argc, argv, desc), vm); + po::notify(vm); + + if (vm.count("help")) { + cout << desc << "\n"; + return 1; + } + + // Check required flags. + for (const string& flag : {"lm", "vocab", "package", "default_alpha", "default_beta"}) { + if (!vm.count(flag)) { + cerr << "--" << flag << " is a required flag. Pass --help for help.\n"; + return 1; + } + } + + // Parse optional --force_utf8 + absl::optional force_utf8 = absl::nullopt; + if (vm.count("force_utf8")) { + force_utf8 = vm["force_utf8"].as(); + } + + // Parse optional --alphabet + absl::optional alphabet = absl::nullopt; + if (vm.count("alphabet")) { + alphabet = vm["alphabet"].as(); + } + + create_package(alphabet, + vm["lm"].as(), + vm["vocab"].as(), + vm["package"].as(), + force_utf8, + vm["default_alpha"].as(), + vm["default_beta"].as()); + + return 0; +} From a84abf813cd0cbc3257429aa44d00a3552f98f71 Mon Sep 17 00:00:00 2001 From: Reuben Morais Date: Fri, 26 Jun 2020 11:57:10 +0200 Subject: [PATCH 3/9] Deduplicate Alphabet implementations, use C++ one everywhere --- native_client/BUILD | 19 ++- native_client/alphabet.cc | 154 +++++++++++++++++++ native_client/alphabet.h | 146 ++++-------------- native_client/ctcdecode/__init__.py | 42 ++--- native_client/ctcdecode/build_archive.py | 3 +- native_client/ctcdecode/decoder_utils.cpp | 4 +- native_client/ctcdecode/decoder_utils.h | 2 +- native_client/ctcdecode/output.h | 
4 +- native_client/ctcdecode/path_trie.cpp | 18 +-- native_client/ctcdecode/path_trie.h | 18 +-- native_client/ctcdecode/scorer.cpp | 12 +- native_client/ctcdecode/scorer.h | 2 +- native_client/ctcdecode/swigwrapper.i | 9 +- native_client/modelstate.cc | 4 +- native_client/tflitemodelstate.cc | 2 +- native_client/tfmodelstate.cc | 2 +- tests/test_text.py | 6 +- training/deepspeech_training/evaluate.py | 2 +- training/deepspeech_training/train.py | 2 +- training/deepspeech_training/util/config.py | 5 +- training/deepspeech_training/util/helpers.py | 6 +- training/deepspeech_training/util/text.py | 117 +------------- 22 files changed, 257 insertions(+), 322 deletions(-) create mode 100644 native_client/alphabet.cc diff --git a/native_client/BUILD b/native_client/BUILD index 36702088d0..232d99c776 100644 --- a/native_client/BUILD +++ b/native_client/BUILD @@ -75,6 +75,7 @@ cc_library( "ctcdecode/scorer.cpp", "ctcdecode/path_trie.cpp", "ctcdecode/path_trie.h", + "alphabet.cc", ] + OPENFST_SOURCES_PLATFORM, hdrs = [ "ctcdecode/ctc_beam_search_decoder.h", @@ -86,13 +87,17 @@ cc_library( ".", "ctcdecode/third_party/ThreadPool", ] + OPENFST_INCLUDES_PLATFORM, - deps = [":kenlm"] + deps = [":kenlm"], + linkopts = [ + "-lm", + "-ldl", + "-pthread", + ], ) tf_cc_shared_object( name = "libdeepspeech.so", srcs = [ - "alphabet.h", "deepspeech.cc", "deepspeech.h", "deepspeech_errors.cc", @@ -203,6 +208,11 @@ cc_binary( "@com_google_absl//absl/types:optional", "@boost//:program_options", ], + linkopts = [ + "-lm", + "-ldl", + "-pthread", + ], ) cc_binary( @@ -221,10 +231,5 @@ cc_binary( "trie_load.cc", ], copts = ["-std=c++11"], - linkopts = [ - "-lm", - "-ldl", - "-pthread", - ], deps = [":decoder"], ) diff --git a/native_client/alphabet.cc b/native_client/alphabet.cc new file mode 100644 index 0000000000..873b4881be --- /dev/null +++ b/native_client/alphabet.cc @@ -0,0 +1,154 @@ +#include "alphabet.h" +#include "ctcdecode/decoder_utils.h" + +#include + +int +Alphabet::init(const 
char *config_file) +{ + std::ifstream in(config_file, std::ios::in); + if (!in) { + return 1; + } + unsigned int label = 0; + space_label_ = -2; + for (std::string line; std::getline(in, line);) { + if (line.size() == 2 && line[0] == '\\' && line[1] == '#') { + line = '#'; + } else if (line[0] == '#') { + continue; + } + //TODO: we should probably do something more i18n-aware here + if (line == " ") { + space_label_ = label; + } + label_to_str_[label] = line; + str_to_label_[line] = label; + ++label; + } + size_ = label; + in.close(); + return 0; +} + +std::string +Alphabet::Serialize() +{ + // Serialization format is a sequence of (key, value) pairs, where key is + // a uint16_t and value is a uint16_t length followed by `length` UTF-8 + // encoded bytes with the label. + std::stringstream out; + + // We start by writing the number of pairs in the buffer as uint16_t. + uint16_t size = size_; + out.write(reinterpret_cast(&size), sizeof(size)); + + for (auto it = label_to_str_.begin(); it != label_to_str_.end(); ++it) { + uint16_t key = it->first; + string str = it->second; + uint16_t len = str.length(); + // Then we write the key as uint16_t, followed by the length of the value + // as uint16_t, followed by `length` bytes (the value itself). + out.write(reinterpret_cast(&key), sizeof(key)); + out.write(reinterpret_cast(&len), sizeof(len)); + out.write(str.data(), len); + } + + return out.str(); +} + +int +Alphabet::Deserialize(const char* buffer, const int buffer_size) +{ + // See util/text.py for an explanation of the serialization format. 
+ int offset = 0; + if (buffer_size - offset < sizeof(uint16_t)) { + return 1; + } + uint16_t size = *(uint16_t*)(buffer + offset); + offset += sizeof(uint16_t); + size_ = size; + + for (int i = 0; i < size; ++i) { + if (buffer_size - offset < sizeof(uint16_t)) { + return 1; + } + uint16_t label = *(uint16_t*)(buffer + offset); + offset += sizeof(uint16_t); + + if (buffer_size - offset < sizeof(uint16_t)) { + return 1; + } + uint16_t val_len = *(uint16_t*)(buffer + offset); + offset += sizeof(uint16_t); + + if (buffer_size - offset < val_len) { + return 1; + } + std::string val(buffer+offset, val_len); + offset += val_len; + + label_to_str_[label] = val; + str_to_label_[val] = label; + + if (val == " ") { + space_label_ = label; + } + } + + return 0; +} + +std::string +Alphabet::DecodeSingle(unsigned int label) const +{ + auto it = label_to_str_.find(label); + if (it != label_to_str_.end()) { + return it->second; + } else { + std::cerr << "Invalid label " << label << std::endl; + abort(); + } +} + +unsigned int +Alphabet::EncodeSingle(const std::string& string) const +{ + auto it = str_to_label_.find(string); + if (it != str_to_label_.end()) { + return it->second; + } else { + std::cerr << "Invalid string " << string << std::endl; + abort(); + } +} + +std::string +Alphabet::Decode(const std::vector& input) const +{ + std::string word; + for (auto ind : input) { + word += DecodeSingle(ind); + } + return word; +} + +std::string +Alphabet::Decode(const unsigned int* input, int length) const +{ + std::string word; + for (int i = 0; i < length; ++i) { + word += DecodeSingle(input[i]); + } + return word; +} + +std::vector +Alphabet::Encode(const std::string& input) const +{ + std::vector result; + for (auto cp : split_into_codepoints(input)) { + result.push_back(EncodeSingle(cp)); + } + return result; +} diff --git a/native_client/alphabet.h b/native_client/alphabet.h index e57ef91446..45fc444e5c 100644 --- a/native_client/alphabet.h +++ b/native_client/alphabet.h @@ 
-1,9 +1,6 @@ #ifndef ALPHABET_H #define ALPHABET_H -#include -#include -#include #include #include #include @@ -18,116 +15,15 @@ class Alphabet { Alphabet() = default; Alphabet(const Alphabet&) = default; Alphabet& operator=(const Alphabet&) = default; + virtual ~Alphabet() = default; - virtual int init(const char *config_file) { - std::ifstream in(config_file, std::ios::in); - if (!in) { - return 1; - } - unsigned int label = 0; - space_label_ = -2; - for (std::string line; std::getline(in, line);) { - if (line.size() == 2 && line[0] == '\\' && line[1] == '#') { - line = '#'; - } else if (line[0] == '#') { - continue; - } - //TODO: we should probably do something more i18n-aware here - if (line == " ") { - space_label_ = label; - } - label_to_str_[label] = line; - str_to_label_[line] = label; - ++label; - } - size_ = label; - in.close(); - return 0; - } - - std::string serialize() { - // Serialization format is a sequence of (key, value) pairs, where key is - // a uint16_t and value is a uint16_t length followed by `length` UTF-8 - // encoded bytes with the label. - std::stringstream out; - - // We start by writing the number of pairs in the buffer as uint16_t. - uint16_t size = size_; - out.write(reinterpret_cast(&size), sizeof(size)); - - for (auto it = label_to_str_.begin(); it != label_to_str_.end(); ++it) { - uint16_t key = it->first; - string str = it->second; - uint16_t len = str.length(); - // Then we write the key as uint16_t, followed by the length of the value - // as uint16_t, followed by `length` bytes (the value itself). - out.write(reinterpret_cast(&key), sizeof(key)); - out.write(reinterpret_cast(&len), sizeof(len)); - out.write(str.data(), len); - } - - return out.str(); - } - - int deserialize(const char* buffer, const int buffer_size) { - // See util/text.py for an explanation of the serialization format. 
- int offset = 0; - if (buffer_size - offset < sizeof(uint16_t)) { - return 1; - } - uint16_t size = *(uint16_t*)(buffer + offset); - offset += sizeof(uint16_t); - size_ = size; - - for (int i = 0; i < size; ++i) { - if (buffer_size - offset < sizeof(uint16_t)) { - return 1; - } - uint16_t label = *(uint16_t*)(buffer + offset); - offset += sizeof(uint16_t); - - if (buffer_size - offset < sizeof(uint16_t)) { - return 1; - } - uint16_t val_len = *(uint16_t*)(buffer + offset); - offset += sizeof(uint16_t); - - if (buffer_size - offset < val_len) { - return 1; - } - std::string val(buffer+offset, val_len); - offset += val_len; - - label_to_str_[label] = val; - str_to_label_[val] = label; - - if (val == " ") { - space_label_ = label; - } - } - - return 0; - } + virtual int init(const char *config_file); - const std::string& StringFromLabel(unsigned int label) const { - auto it = label_to_str_.find(label); - if (it != label_to_str_.end()) { - return it->second; - } else { - std::cerr << "Invalid label " << label << std::endl; - abort(); - } - } + // Serialize alphabet into a binary buffer. + std::string Serialize(); - unsigned int LabelFromString(const std::string& string) const { - auto it = str_to_label_.find(string); - if (it != str_to_label_.end()) { - return it->second; - } else { - std::cerr << "Invalid string " << string << std::endl; - abort(); - } - } + // Deserialize alphabet from a binary buffer. + int Deserialize(const char* buffer, const int buffer_size); size_t GetSize() const { return size_; @@ -141,14 +37,22 @@ class Alphabet { return space_label_; } - template - std::string LabelsToString(const std::vector& input) const { - std::string word; - for (auto ind : input) { - word += StringFromLabel(ind); - } - return word; - } + // Decode a single label into a string. + std::string DecodeSingle(unsigned int label) const; + + // Encode a single character/output class into a label. 
+ unsigned int EncodeSingle(const std::string& string) const; + + // Decode a sequence of labels into a string. + std::string Decode(const std::vector& input) const; + + // We provide a C-style overload for accepting NumPy arrays as input, since + // the NumPy library does not have built-in typemaps for std::vector. + std::string Decode(const unsigned int* input, int length) const; + + // Encode a sequence of character/output classes into a sequence of labels. + // Characters are assumed to always take a single Unicode codepoint. + std::vector Encode(const std::string& input) const; protected: size_t size_; @@ -163,14 +67,16 @@ class UTF8Alphabet : public Alphabet UTF8Alphabet() { size_ = 255; space_label_ = ' ' - 1; - for (int i = 0; i < size_; ++i) { + for (size_t i = 0; i < size_; ++i) { std::string val(1, i+1); label_to_str_[i] = val; str_to_label_[val] = i; } } - int init(const char*) override {} + int init(const char*) override { + return 0; + } }; diff --git a/native_client/ctcdecode/__init__.py b/native_client/ctcdecode/__init__.py index 7e3766bebf..ac603aa9ba 100644 --- a/native_client/ctcdecode/__init__.py +++ b/native_client/ctcdecode/__init__.py @@ -1,7 +1,7 @@ from __future__ import absolute_import, division, print_function from . 
import swigwrapper # pylint: disable=import-self -from .swigwrapper import Alphabet +from .swigwrapper import UTF8Alphabet __version__ = swigwrapper.__version__ @@ -30,24 +30,20 @@ def __init__(self, alpha=None, beta=None, scorer_path=None, alphabet=None): assert beta is not None, 'beta parameter is required' assert scorer_path, 'scorer_path parameter is required' - serialized = alphabet.serialize() - native_alphabet = swigwrapper.Alphabet() - err = native_alphabet.deserialize(serialized, len(serialized)) + err = self.init(scorer_path, alphabet) if err != 0: - raise ValueError('Error when deserializing alphabet.') - - err = self.init(scorer_path.encode('utf-8'), - native_alphabet) - if err != 0: - raise ValueError('Scorer initialization failed with error code {}'.format(err)) + raise ValueError('Scorer initialization failed with error code 0x{:X}'.format(err)) self.reset_params(alpha, beta) - def load_lm(self, lm_path): - return super(Scorer, self).load_lm(lm_path.encode('utf-8')) - def save_dictionary(self, save_path, *args, **kwargs): - return super(Scorer, self).save_dictionary(save_path.encode('utf-8'), *args, **kwargs) +class Alphabet(swigwrapper.Alphabet): + """Convenience wrapper for Alphabet which calls init in the constructor""" + def __init__(self, config_path): + super(Alphabet, self).__init__() + err = self.init(config_path) + if err != 0: + raise ValueError('Alphabet initialization failed with error code 0x{:X}'.format(err)) def ctc_beam_search_decoder(probs_seq, @@ -79,15 +75,10 @@ def ctc_beam_search_decoder(probs_seq, results, in descending order of the confidence. 
:rtype: list """ - serialized = alphabet.serialize() - native_alphabet = swigwrapper.Alphabet() - err = native_alphabet.deserialize(serialized, len(serialized)) - if err != 0: - raise ValueError("Error when deserializing alphabet.") beam_results = swigwrapper.ctc_beam_search_decoder( - probs_seq, native_alphabet, beam_size, cutoff_prob, cutoff_top_n, + probs_seq, alphabet, beam_size, cutoff_prob, cutoff_top_n, scorer) - beam_results = [(res.confidence, alphabet.decode(res.tokens)) for res in beam_results] + beam_results = [(res.confidence, alphabet.Decode(res.tokens)) for res in beam_results] return beam_results @@ -126,14 +117,9 @@ def ctc_beam_search_decoder_batch(probs_seq, results, in descending order of the confidence. :rtype: list """ - serialized = alphabet.serialize() - native_alphabet = swigwrapper.Alphabet() - err = native_alphabet.deserialize(serialized, len(serialized)) - if err != 0: - raise ValueError("Error when deserializing alphabet.") - batch_beam_results = swigwrapper.ctc_beam_search_decoder_batch(probs_seq, seq_lengths, native_alphabet, beam_size, num_processes, cutoff_prob, cutoff_top_n, scorer) + batch_beam_results = swigwrapper.ctc_beam_search_decoder_batch(probs_seq, seq_lengths, alphabet, beam_size, num_processes, cutoff_prob, cutoff_top_n, scorer) batch_beam_results = [ - [(res.confidence, alphabet.decode(res.tokens)) for res in beam_results] + [(res.confidence, alphabet.Decode(res.tokens)) for res in beam_results] for beam_results in batch_beam_results ] return batch_beam_results diff --git a/native_client/ctcdecode/build_archive.py b/native_client/ctcdecode/build_archive.py index c379d6b376..8a689ac0cd 100644 --- a/native_client/ctcdecode/build_archive.py +++ b/native_client/ctcdecode/build_archive.py @@ -46,7 +46,8 @@ 'scorer.cpp', 'path_trie.cpp', 'decoder_utils.cpp', - 'workspace_status.cc' + 'workspace_status.cc', + '../alphabet.cc', ] def build_archive(srcs=[], out_name='', build_dir='temp_build/temp_build', debug=False, 
num_parallel=1): diff --git a/native_client/ctcdecode/decoder_utils.cpp b/native_client/ctcdecode/decoder_utils.cpp index ed244c3a7a..bb3e1c77a1 100644 --- a/native_client/ctcdecode/decoder_utils.cpp +++ b/native_client/ctcdecode/decoder_utils.cpp @@ -119,7 +119,7 @@ bool prefix_compare_external(const PathTrie *x, const PathTrie *y, const std::un } } -void add_word_to_fst(const std::vector &word, +void add_word_to_fst(const std::vector &word, fst::StdVectorFst *dictionary) { if (dictionary->NumStates() == 0) { fst::StdVectorFst::StateId start = dictionary->AddState(); @@ -144,7 +144,7 @@ bool add_word_to_dictionary( fst::StdVectorFst *dictionary) { auto characters = utf8 ? split_into_bytes(word) : split_into_codepoints(word); - std::vector int_word; + std::vector int_word; for (auto &c : characters) { auto int_c = char_map.find(c); diff --git a/native_client/ctcdecode/decoder_utils.h b/native_client/ctcdecode/decoder_utils.h index 3ba1d7e60d..c51ea046ac 100644 --- a/native_client/ctcdecode/decoder_utils.h +++ b/native_client/ctcdecode/decoder_utils.h @@ -86,7 +86,7 @@ std::vector split_into_codepoints(const std::string &str); std::vector split_into_bytes(const std::string &str); // Add a word in index to the dicionary of fst -void add_word_to_fst(const std::vector &word, +void add_word_to_fst(const std::vector &word, fst::StdVectorFst *dictionary); // Return whether a byte is a code point boundary (not a continuation byte). 
diff --git a/native_client/ctcdecode/output.h b/native_client/ctcdecode/output.h index 10eb4228a6..bdfc8ee9dd 100644 --- a/native_client/ctcdecode/output.h +++ b/native_client/ctcdecode/output.h @@ -8,8 +8,8 @@ */ struct Output { double confidence; - std::vector tokens; - std::vector timesteps; + std::vector tokens; + std::vector timesteps; }; #endif // OUTPUT_H_ diff --git a/native_client/ctcdecode/path_trie.cpp b/native_client/ctcdecode/path_trie.cpp index 0c0ee98cfb..7a04f693c5 100644 --- a/native_client/ctcdecode/path_trie.cpp +++ b/native_client/ctcdecode/path_trie.cpp @@ -35,7 +35,7 @@ PathTrie::~PathTrie() { } } -PathTrie* PathTrie::get_path_trie(int new_char, int new_timestep, float cur_log_prob_c, bool reset) { +PathTrie* PathTrie::get_path_trie(unsigned int new_char, unsigned int new_timestep, float cur_log_prob_c, bool reset) { auto child = children_.begin(); for (; child != children_.end(); ++child) { if (child->first == new_char) { @@ -102,7 +102,7 @@ PathTrie* PathTrie::get_path_trie(int new_char, int new_timestep, float cur_log_ } } -void PathTrie::get_path_vec(std::vector& output, std::vector& timesteps) { +void PathTrie::get_path_vec(std::vector& output, std::vector& timesteps) { // Recursive call: recurse back until stop condition, then append data in // correct order as we walk back down the stack in the lines below. if (parent != nullptr) { @@ -114,8 +114,8 @@ void PathTrie::get_path_vec(std::vector& output, std::vector& timestep } } -PathTrie* PathTrie::get_prev_grapheme(std::vector& output, - std::vector& timesteps, +PathTrie* PathTrie::get_prev_grapheme(std::vector& output, + std::vector& timesteps, const Alphabet& alphabet) { PathTrie* stop = this; @@ -124,7 +124,7 @@ PathTrie* PathTrie::get_prev_grapheme(std::vector& output, } // Recursive call: recurse back until stop condition, then append data in // correct order as we walk back down the stack in the lines below. 
- if (!byte_is_codepoint_boundary(alphabet.StringFromLabel(character)[0])) { + if (!byte_is_codepoint_boundary(alphabet.DecodeSingle(character)[0])) { stop = parent->get_prev_grapheme(output, timesteps, alphabet); } output.push_back(character); @@ -135,7 +135,7 @@ PathTrie* PathTrie::get_prev_grapheme(std::vector& output, int PathTrie::distance_to_codepoint_boundary(unsigned char *first_byte, const Alphabet& alphabet) { - if (byte_is_codepoint_boundary(alphabet.StringFromLabel(character)[0])) { + if (byte_is_codepoint_boundary(alphabet.DecodeSingle(character)[0])) { *first_byte = (unsigned char)character + 1; return 1; } @@ -146,8 +146,8 @@ int PathTrie::distance_to_codepoint_boundary(unsigned char *first_byte, return 0; } -PathTrie* PathTrie::get_prev_word(std::vector& output, - std::vector& timesteps, +PathTrie* PathTrie::get_prev_word(std::vector& output, + std::vector& timesteps, const Alphabet& alphabet) { PathTrie* stop = this; @@ -225,7 +225,7 @@ void PathTrie::print(const Alphabet& a) { for (PathTrie* el : chain) { printf("%X ", (unsigned char)(el->character)); if (el->character != ROOT_) { - tr.append(a.StringFromLabel(el->character)); + tr.append(a.DecodeSingle(el->character)); } } printf("\ntimesteps:\t "); diff --git a/native_client/ctcdecode/path_trie.h b/native_client/ctcdecode/path_trie.h index dbd8a2337a..0a4374fc56 100644 --- a/native_client/ctcdecode/path_trie.h +++ b/native_client/ctcdecode/path_trie.h @@ -21,22 +21,22 @@ class PathTrie { ~PathTrie(); // get new prefix after appending new char - PathTrie* get_path_trie(int new_char, int new_timestep, float log_prob_c, bool reset = true); + PathTrie* get_path_trie(unsigned int new_char, unsigned int new_timestep, float log_prob_c, bool reset = true); // get the prefix data in correct time order from root to current node - void get_path_vec(std::vector& output, std::vector& timesteps); + void get_path_vec(std::vector& output, std::vector& timesteps); // get the prefix data in correct time order 
from beginning of last grapheme to current node - PathTrie* get_prev_grapheme(std::vector& output, - std::vector& timesteps, + PathTrie* get_prev_grapheme(std::vector& output, + std::vector& timesteps, const Alphabet& alphabet); // get the distance from current node to the first codepoint boundary, and the byte value at the boundary int distance_to_codepoint_boundary(unsigned char *first_byte, const Alphabet& alphabet); // get the prefix data in correct time order from beginning of last word to current node - PathTrie* get_prev_word(std::vector& output, - std::vector& timesteps, + PathTrie* get_prev_word(std::vector& output, + std::vector& timesteps, const Alphabet& alphabet); // update log probs @@ -64,8 +64,8 @@ class PathTrie { float log_prob_c; float score; float approx_ctc; - int character; - int timestep; + unsigned int character; + unsigned int timestep; PathTrie* parent; private: @@ -73,7 +73,7 @@ class PathTrie { bool exists_; bool has_dictionary_; - std::vector> children_; + std::vector> children_; // pointer to dictionary of FST std::shared_ptr dictionary_; diff --git a/native_client/ctcdecode/scorer.cpp b/native_client/ctcdecode/scorer.cpp index 401613d131..a6616e2178 100644 --- a/native_client/ctcdecode/scorer.cpp +++ b/native_client/ctcdecode/scorer.cpp @@ -65,7 +65,7 @@ void Scorer::setup_char_map() // The initial state of FST is state 0, hence the index of chars in // the FST should start from 1 to avoid the conflict with the initial // state, otherwise wrong decoding results would be given. 
- char_map_[alphabet_.StringFromLabel(i)] = i + 1; + char_map_[alphabet_.DecodeSingle(i)] = i + 1; } } @@ -314,11 +314,11 @@ void Scorer::reset_params(float alpha, float beta) this->beta = beta; } -std::vector Scorer::split_labels_into_scored_units(const std::vector& labels) +std::vector Scorer::split_labels_into_scored_units(const std::vector& labels) { if (labels.empty()) return {}; - std::string s = alphabet_.LabelsToString(labels); + std::string s = alphabet_.Decode(labels); std::vector words; if (is_utf8_mode_) { words = split_into_codepoints(s); @@ -339,8 +339,8 @@ std::vector Scorer::make_ngram(PathTrie* prefix) break; } - std::vector prefix_vec; - std::vector prefix_steps; + std::vector prefix_vec; + std::vector prefix_steps; if (is_utf8_mode_) { new_node = current_node->get_prev_grapheme(prefix_vec, prefix_steps, alphabet_); @@ -350,7 +350,7 @@ std::vector Scorer::make_ngram(PathTrie* prefix) current_node = new_node->parent; // reconstruct word - std::string word = alphabet_.LabelsToString(prefix_vec); + std::string word = alphabet_.Decode(prefix_vec); ngram.push_back(word); } std::reverse(ngram.begin(), ngram.end()); diff --git a/native_client/ctcdecode/scorer.h b/native_client/ctcdecode/scorer.h index 3e7c076102..13c2ef1f09 100644 --- a/native_client/ctcdecode/scorer.h +++ b/native_client/ctcdecode/scorer.h @@ -73,7 +73,7 @@ class Scorer { // trransform the labels in index to the vector of words (word based lm) or // the vector of characters (character based lm) - std::vector split_labels_into_scored_units(const std::vector &labels); + std::vector split_labels_into_scored_units(const std::vector &labels); void set_alphabet(const Alphabet& alphabet); diff --git a/native_client/ctcdecode/swigwrapper.i b/native_client/ctcdecode/swigwrapper.i index ab5675be32..ffe23c3a2e 100644 --- a/native_client/ctcdecode/swigwrapper.i +++ b/native_client/ctcdecode/swigwrapper.i @@ -3,7 +3,6 @@ %{ #include "ctc_beam_search_decoder.h" #define SWIG_FILE_WITH_INIT -#define 
SWIG_PYTHON_STRICT_BYTE_CHAR #include "workspace_status.h" %} @@ -19,6 +18,9 @@ import_array(); namespace std { %template(StringVector) vector; + %template(UnsignedIntVector) vector; + %template(OutputVector) vector; + %template(OutputVectorVector) vector>; } %shared_ptr(Scorer); @@ -27,6 +29,7 @@ namespace std { %apply (double* IN_ARRAY2, int DIM1, int DIM2) {(const double *probs, int time_dim, int class_dim)}; %apply (double* IN_ARRAY3, int DIM1, int DIM2, int DIM3) {(const double *probs, int batch_size, int time_dim, int class_dim)}; %apply (int* IN_ARRAY1, int DIM1) {(const int *seq_lengths, int seq_lengths_size)}; +%apply (unsigned int* IN_ARRAY1, int DIM1) {(const unsigned int *input, int length)}; %ignore Scorer::dictionary; @@ -38,10 +41,6 @@ namespace std { %constant const char* __version__ = ds_version(); %constant const char* __git_version__ = ds_git_version(); -%template(IntVector) std::vector; -%template(OutputVector) std::vector; -%template(OutputVectorVector) std::vector>; - // Import only the error code enum definitions from deepspeech.h // We can't just do |%ignore "";| here because it affects this file globally (even // files %include'd above). 
That causes SWIG to lose destructor information and diff --git a/native_client/modelstate.cc b/native_client/modelstate.cc index 3cb06ac275..d8637c3656 100644 --- a/native_client/modelstate.cc +++ b/native_client/modelstate.cc @@ -33,7 +33,7 @@ char* ModelState::decode(const DecoderState& state) const { vector out = state.decode(); - return strdup(alphabet_.LabelsToString(out[0].tokens).c_str()); + return strdup(alphabet_.Decode(out[0].tokens).c_str()); } Metadata* @@ -50,7 +50,7 @@ ModelState::decode_metadata(const DecoderState& state, for (int j = 0; j < out[i].tokens.size(); ++j) { TokenMetadata token { - strdup(alphabet_.StringFromLabel(out[i].tokens[j]).c_str()), // text + strdup(alphabet_.DecodeSingle(out[i].tokens[j]).c_str()), // text static_cast(out[i].timesteps[j]), // timestep out[i].timesteps[j] * ((float)audio_win_step_ / sample_rate_), // start_time }; diff --git a/native_client/tflitemodelstate.cc b/native_client/tflitemodelstate.cc index 4836ed0b50..12be0d3770 100644 --- a/native_client/tflitemodelstate.cc +++ b/native_client/tflitemodelstate.cc @@ -206,7 +206,7 @@ TFLiteModelState::init(const char* model_path) beam_width_ = (unsigned int)(*beam_width); tflite::StringRef serialized_alphabet = tflite::GetString(interpreter_->tensor(metadata_alphabet_idx), 0); - err = alphabet_.deserialize(serialized_alphabet.str, serialized_alphabet.len); + err = alphabet_.Deserialize(serialized_alphabet.str, serialized_alphabet.len); if (err != 0) { return DS_ERR_INVALID_ALPHABET; } diff --git a/native_client/tfmodelstate.cc b/native_client/tfmodelstate.cc index 440c44e602..65328e308a 100644 --- a/native_client/tfmodelstate.cc +++ b/native_client/tfmodelstate.cc @@ -119,7 +119,7 @@ TFModelState::init(const char* model_path) beam_width_ = (unsigned int)(beam_width); string serialized_alphabet = metadata_outputs[4].scalar()(); - err = alphabet_.deserialize(serialized_alphabet.data(), serialized_alphabet.size()); + err = 
alphabet_.Deserialize(serialized_alphabet.data(), serialized_alphabet.size()); if (err != 0) { return DS_ERR_INVALID_ALPHABET; } diff --git a/tests/test_text.py b/tests/test_text.py index b26fda94d8..5bdda19ef6 100644 --- a/tests/test_text.py +++ b/tests/test_text.py @@ -1,7 +1,7 @@ import unittest import os -from deepspeech_training.util.text import Alphabet +from ds_ctcdecoder import Alphabet class TestAlphabetParsing(unittest.TestCase): @@ -11,12 +11,12 @@ def _ending_tester(self, file, expected): label_id = -1 for expected_label, expected_label_id in expected: try: - label_id = alphabet.encode(expected_label) + label_id = alphabet.Encode(expected_label) except KeyError: pass self.assertEqual(label_id, [expected_label_id]) try: - label = alphabet.decode([expected_label_id]) + label = alphabet.Decode([expected_label_id]) except KeyError: pass self.assertEqual(label, expected_label) diff --git a/training/deepspeech_training/evaluate.py b/training/deepspeech_training/evaluate.py index 716b5f9339..00eac8c7dc 100755 --- a/training/deepspeech_training/evaluate.py +++ b/training/deepspeech_training/evaluate.py @@ -40,7 +40,7 @@ def sparse_tuple_to_texts(sp_tuple, alphabet): for i, index in enumerate(indices): results[index[0]].append(values[i]) # List of strings - return [alphabet.decode(res) for res in results] + return [alphabet.Decode(res) for res in results] def evaluate(test_csvs, create_model): diff --git a/training/deepspeech_training/train.py b/training/deepspeech_training/train.py index 175032da52..93d0c7275e 100644 --- a/training/deepspeech_training/train.py +++ b/training/deepspeech_training/train.py @@ -771,7 +771,7 @@ def export(): outputs['metadata_feature_win_len'] = tf.constant([FLAGS.feature_win_len], name='metadata_feature_win_len') outputs['metadata_feature_win_step'] = tf.constant([FLAGS.feature_win_step], name='metadata_feature_win_step') outputs['metadata_beam_width'] = tf.constant([FLAGS.export_beam_width], name='metadata_beam_width') - 
outputs['metadata_alphabet'] = tf.constant([Config.alphabet.serialize()], name='metadata_alphabet') + outputs['metadata_alphabet'] = tf.constant([Config.alphabet.Serialize()], name='metadata_alphabet') if FLAGS.export_language: outputs['metadata_language'] = tf.constant([FLAGS.export_language.encode('utf-8')], name='metadata_language') diff --git a/training/deepspeech_training/util/config.py b/training/deepspeech_training/util/config.py index 13a362201d..2bd580b54d 100755 --- a/training/deepspeech_training/util/config.py +++ b/training/deepspeech_training/util/config.py @@ -6,14 +6,15 @@ from attrdict import AttrDict from xdg import BaseDirectory as xdg +from ds_ctcdecoder import Alphabet, UTF8Alphabet from .flags import FLAGS from .gpu import get_available_gpus from .logging import log_error, log_warn -from .text import Alphabet, UTF8Alphabet from .helpers import parse_file_size from .augmentations import parse_augmentations + class ConfigSingleton: _config = None @@ -115,7 +116,7 @@ def initialize_globals(): c.n_hidden_3 = c.n_cell_dim # Units in the sixth layer = number of characters in the target language plus one - c.n_hidden_6 = c.alphabet.size() + 1 # +1 for CTC blank label + c.n_hidden_6 = c.alphabet.GetSize() + 1 # +1 for CTC blank label # Size of audio window in samples if (FLAGS.feature_win_len * FLAGS.audio_sample_rate) % 1000 != 0: diff --git a/training/deepspeech_training/util/helpers.py b/training/deepspeech_training/util/helpers.py index 6da708b90d..32116f3f00 100644 --- a/training/deepspeech_training/util/helpers.py +++ b/training/deepspeech_training/util/helpers.py @@ -52,12 +52,10 @@ def check_ctcdecoder_version(): sys.exit(1) raise e - decoder_version_s = decoder_version.decode() - - rv = semver.compare(ds_version_s, decoder_version_s) + rv = semver.compare(ds_version_s, decoder_version) if rv != 0: print("DeepSpeech version ({}) and CTC decoder version ({}) do not match. 
" - "Please ensure matching versions are in use.".format(ds_version_s, decoder_version_s)) + "Please ensure matching versions are in use.".format(ds_version_s, decoder_version)) sys.exit(1) return rv diff --git a/training/deepspeech_training/util/text.py b/training/deepspeech_training/util/text.py index 60bfe9f122..e1c2e981e0 100644 --- a/training/deepspeech_training/util/text.py +++ b/training/deepspeech_training/util/text.py @@ -3,121 +3,6 @@ import numpy as np import struct -from six.moves import range - -class Alphabet(object): - def __init__(self, config_file): - self._config_file = config_file - self._label_to_str = {} - self._str_to_label = {} - self._size = 0 - if config_file: - with open(config_file, 'r', encoding='utf-8') as fin: - for line in fin: - if line[0:2] == '\\#': - line = '#\n' - elif line[0] == '#': - continue - self._label_to_str[self._size] = line[:-1] # remove the line ending - self._str_to_label[line[:-1]] = self._size - self._size += 1 - - def _string_from_label(self, label): - return self._label_to_str[label] - - def _label_from_string(self, string): - try: - return self._str_to_label[string] - except KeyError as e: - raise KeyError( - 'ERROR: Your transcripts contain characters (e.g. \'{}\') which do not occur in \'{}\'! 
Use ' \ - 'util/check_characters.py to see what characters are in your [train,dev,test].csv transcripts, and ' \ - 'then add all these to \'{}\'.'.format(string, self._config_file, self._config_file) - ).with_traceback(e.__traceback__) - - def has_char(self, char): - return char in self._str_to_label - - def encode(self, string): - res = [] - for char in string: - res.append(self._label_from_string(char)) - return res - - def decode(self, labels): - res = '' - for label in labels: - res += self._string_from_label(label) - return res - - def serialize(self): - # Serialization format is a sequence of (key, value) pairs, where key is - # a uint16_t and value is a uint16_t length followed by `length` UTF-8 - # encoded bytes with the label. - res = bytearray() - - # We start by writing the number of pairs in the buffer as uint16_t. - res += struct.pack(' Date: Sun, 28 Jun 2020 16:52:08 +0200 Subject: [PATCH 4/9] Build/package/publish generate_scorer_package in CI --- native_client/Android.mk | 2 +- taskcluster/android-build.sh | 1 + taskcluster/arm64-build.sh | 1 + taskcluster/cuda-build.sh | 1 + taskcluster/host-build.sh | 1 + taskcluster/rpi3-build.sh | 1 + taskcluster/tc-package.sh | 2 ++ taskcluster/win-build.sh | 1 + 8 files changed, 9 insertions(+), 1 deletion(-) diff --git a/native_client/Android.mk b/native_client/Android.mk index d21551fd1c..ee91a7a493 100644 --- a/native_client/Android.mk +++ b/native_client/Android.mk @@ -2,7 +2,7 @@ LOCAL_PATH := $(call my-dir) include $(CLEAR_VARS) LOCAL_MODULE := deepspeech-prebuilt -LOCAL_SRC_FILES := $(TFDIR)/bazel-bin/native_client/libdeepspeech.so +LOCAL_SRC_FILES := $(TFDIR)/bazel-bin/native_client/libdeepspeech.so $(TFDIR)/bazel-bin/native_client/generate_scorer_package include $(PREBUILT_SHARED_LIBRARY) include $(CLEAR_VARS) diff --git a/taskcluster/android-build.sh b/taskcluster/android-build.sh index 0121e51d84..83d8eecb0d 100644 --- a/taskcluster/android-build.sh +++ b/taskcluster/android-build.sh @@ -10,6 +10,7 
@@ source $(dirname "$0")/tf_tc-vars.sh BAZEL_TARGETS=" //native_client:libdeepspeech.so +//native_client:generate_scorer_package " if [ "${arm_flavor}" = "armeabi-v7a" ]; then diff --git a/taskcluster/arm64-build.sh b/taskcluster/arm64-build.sh index 7cfb7abf38..1ca4028ec6 100644 --- a/taskcluster/arm64-build.sh +++ b/taskcluster/arm64-build.sh @@ -8,6 +8,7 @@ source $(dirname "$0")/tf_tc-vars.sh BAZEL_TARGETS=" //native_client:libdeepspeech.so +//native_client:generate_scorer_package " BAZEL_BUILD_FLAGS="${BAZEL_ARM64_FLAGS} ${BAZEL_EXTRA_FLAGS}" diff --git a/taskcluster/cuda-build.sh b/taskcluster/cuda-build.sh index dfaa236ac9..f8213f8104 100755 --- a/taskcluster/cuda-build.sh +++ b/taskcluster/cuda-build.sh @@ -8,6 +8,7 @@ source $(dirname "$0")/tf_tc-vars.sh BAZEL_TARGETS=" //native_client:libdeepspeech.so +//native_client:generate_scorer_package " BAZEL_ENV_FLAGS="TF_NEED_CUDA=1 ${TF_CUDA_FLAGS}" diff --git a/taskcluster/host-build.sh b/taskcluster/host-build.sh index ddbc90fcf9..9ff3648c2d 100755 --- a/taskcluster/host-build.sh +++ b/taskcluster/host-build.sh @@ -10,6 +10,7 @@ source $(dirname "$0")/tf_tc-vars.sh BAZEL_TARGETS=" //native_client:libdeepspeech.so +//native_client:generate_scorer_package " if [ "${runtime}" = "tflite" ]; then diff --git a/taskcluster/rpi3-build.sh b/taskcluster/rpi3-build.sh index e9b795a430..eabff73091 100755 --- a/taskcluster/rpi3-build.sh +++ b/taskcluster/rpi3-build.sh @@ -8,6 +8,7 @@ source $(dirname "$0")/tf_tc-vars.sh BAZEL_TARGETS=" //native_client:libdeepspeech.so +//native_client:generate_scorer_package " BAZEL_BUILD_FLAGS="${BAZEL_ARM_FLAGS} ${BAZEL_EXTRA_FLAGS}" diff --git a/taskcluster/tc-package.sh b/taskcluster/tc-package.sh index 039979911d..ec36bb846f 100755 --- a/taskcluster/tc-package.sh +++ b/taskcluster/tc-package.sh @@ -24,6 +24,7 @@ package_native_client() ${TAR} -cf - \ -C ${tensorflow_dir}/bazel-bin/native_client/ libdeepspeech.so \ -C ${tensorflow_dir}/bazel-bin/native_client/ libdeepspeech.so.if.lib 
\ + -C ${tensorflow_dir}/bazel-bin/native_client/ generate_scorer_package \ -C ${deepspeech_dir}/ LICENSE \ -C ${deepspeech_dir}/native_client/ deepspeech${PLATFORM_EXE_SUFFIX} \ -C ${deepspeech_dir}/native_client/ deepspeech.h \ @@ -56,6 +57,7 @@ package_native_client_ndk() tar -cf - \ -C ${deepspeech_dir}/native_client/libs/${arch_abi}/ deepspeech \ -C ${deepspeech_dir}/native_client/libs/${arch_abi}/ libdeepspeech.so \ + -C ${deepspeech_dir}/native_client/libs/${arch_abi}/ generate_scorer_package \ -C ${deepspeech_dir}/native_client/libs/${arch_abi}/ libc++_shared.so \ -C ${deepspeech_dir}/native_client/ deepspeech.h \ -C ${deepspeech_dir}/ LICENSE \ diff --git a/taskcluster/win-build.sh b/taskcluster/win-build.sh index f5951db9c7..48c7a19ffc 100755 --- a/taskcluster/win-build.sh +++ b/taskcluster/win-build.sh @@ -10,6 +10,7 @@ source $(dirname "$0")/tf_tc-vars.sh BAZEL_TARGETS=" //native_client:libdeepspeech.so +//native_client:generate_scorer_package " if [ "${package_option}" = "--cuda" ]; then From 6618148e9bf0ec003ef4223cd136e6e205ac292f Mon Sep 17 00:00:00 2001 From: Reuben Morais Date: Sun, 28 Jun 2020 17:00:27 +0200 Subject: [PATCH 5/9] Update tensorflow with Boost rules --- taskcluster/.build.yml | 2 +- taskcluster/.shared.yml | 36 ++++++++++++++++++------------------ tensorflow | 2 +- 3 files changed, 20 insertions(+), 20 deletions(-) diff --git a/taskcluster/.build.yml b/taskcluster/.build.yml index 7fda06e61c..fd261359c9 100644 --- a/taskcluster/.build.yml +++ b/taskcluster/.build.yml @@ -25,7 +25,7 @@ build: nc_asset_name: 'native_client.tar.xz' args: tests_cmdline: '' - tensorflow_git_desc: 'TensorFlow: v2.2.0-14-g7ead558' + tensorflow_git_desc: 'TensorFlow: v2.2.0-15-g518c1d0' test_model_task: '' homebrew: url: '' diff --git a/taskcluster/.shared.yml b/taskcluster/.shared.yml index ce031b2952..46cd698375 100644 --- a/taskcluster/.shared.yml +++ b/taskcluster/.shared.yml @@ -142,32 +142,32 @@ system: namespace: 
"project.deepspeech.swig.win.amd64.b5fea54d39832d1d132d7dd921b69c0c2c9d5118" tensorflow: linux_amd64_cpu: - url: "https://community-tc.services.mozilla.com/api/index/v1/task/project.deepspeech.tensorflow.pip.r2.2.7ead55807a2ded84c107720ebca61e6285e2c239.1.cpu/artifacts/public/home.tar.xz" - namespace: "project.deepspeech.tensorflow.pip.r2.2.7ead55807a2ded84c107720ebca61e6285e2c239.1.cpu" + url: "https://community-tc.services.mozilla.com/api/index/v1/task/project.deepspeech.tensorflow.pip.r2.2-submodule.347767452d19a45a6aeb3694e54adce4d945634a.1.cpu/artifacts/public/home.tar.xz" + namespace: "project.deepspeech.tensorflow.pip.r2.2-submodule.347767452d19a45a6aeb3694e54adce4d945634a.1.cpu" linux_amd64_cuda: - url: "https://community-tc.services.mozilla.com/api/index/v1/task/project.deepspeech.tensorflow.pip.r2.2.7ead55807a2ded84c107720ebca61e6285e2c239.1.cuda/artifacts/public/home.tar.xz" - namespace: "project.deepspeech.tensorflow.pip.r2.2.7ead55807a2ded84c107720ebca61e6285e2c239.1.cuda" + url: "https://community-tc.services.mozilla.com/api/index/v1/task/project.deepspeech.tensorflow.pip.r2.2-submodule.347767452d19a45a6aeb3694e54adce4d945634a.1.cuda/artifacts/public/home.tar.xz" + namespace: "project.deepspeech.tensorflow.pip.r2.2-submodule.347767452d19a45a6aeb3694e54adce4d945634a.1.cuda" linux_armv7: - url: "https://community-tc.services.mozilla.com/api/index/v1/task/project.deepspeech.tensorflow.pip.r2.2.7ead55807a2ded84c107720ebca61e6285e2c239.1.arm/artifacts/public/home.tar.xz" - namespace: "project.deepspeech.tensorflow.pip.r2.2.7ead55807a2ded84c107720ebca61e6285e2c239.1.arm" + url: "https://community-tc.services.mozilla.com/api/index/v1/task/project.deepspeech.tensorflow.pip.r2.2-submodule.347767452d19a45a6aeb3694e54adce4d945634a.1.arm/artifacts/public/home.tar.xz" + namespace: "project.deepspeech.tensorflow.pip.r2.2-submodule.347767452d19a45a6aeb3694e54adce4d945634a.1.arm" linux_arm64: - url: 
"https://community-tc.services.mozilla.com/api/index/v1/task/project.deepspeech.tensorflow.pip.r2.2.7ead55807a2ded84c107720ebca61e6285e2c239.1.arm64/artifacts/public/home.tar.xz" - namespace: "project.deepspeech.tensorflow.pip.r2.2.7ead55807a2ded84c107720ebca61e6285e2c239.1.arm64" + url: "https://community-tc.services.mozilla.com/api/index/v1/task/project.deepspeech.tensorflow.pip.r2.2-submodule.347767452d19a45a6aeb3694e54adce4d945634a.1.arm64/artifacts/public/home.tar.xz" + namespace: "project.deepspeech.tensorflow.pip.r2.2-submodule.347767452d19a45a6aeb3694e54adce4d945634a.1.arm64" darwin_amd64: - url: "https://community-tc.services.mozilla.com/api/index/v1/task/project.deepspeech.tensorflow.pip.r2.2.7ead55807a2ded84c107720ebca61e6285e2c239.1.osx/artifacts/public/home.tar.xz" - namespace: "project.deepspeech.tensorflow.pip.r2.2.7ead55807a2ded84c107720ebca61e6285e2c239.1.osx" + url: "https://community-tc.services.mozilla.com/api/index/v1/task/project.deepspeech.tensorflow.pip.r2.2-submodule.347767452d19a45a6aeb3694e54adce4d945634a.1.osx/artifacts/public/home.tar.xz" + namespace: "project.deepspeech.tensorflow.pip.r2.2-submodule.347767452d19a45a6aeb3694e54adce4d945634a.1.osx" android_arm64: - url: "https://community-tc.services.mozilla.com/api/index/v1/task/project.deepspeech.tensorflow.pip.r2.2.7ead55807a2ded84c107720ebca61e6285e2c239.1.android-arm64/artifacts/public/home.tar.xz" - namespace: "project.deepspeech.tensorflow.pip.r2.2.7ead55807a2ded84c107720ebca61e6285e2c239.1.android-arm64" + url: "https://community-tc.services.mozilla.com/api/index/v1/task/project.deepspeech.tensorflow.pip.r2.2-submodule.347767452d19a45a6aeb3694e54adce4d945634a.1.android-arm64/artifacts/public/home.tar.xz" + namespace: "project.deepspeech.tensorflow.pip.r2.2-submodule.347767452d19a45a6aeb3694e54adce4d945634a.1.android-arm64" android_armv7: - url: 
"https://community-tc.services.mozilla.com/api/index/v1/task/project.deepspeech.tensorflow.pip.r2.2.7ead55807a2ded84c107720ebca61e6285e2c239.1.android-armv7/artifacts/public/home.tar.xz" - namespace: "project.deepspeech.tensorflow.pip.r2.2.7ead55807a2ded84c107720ebca61e6285e2c239.1.android-armv7" + url: "https://community-tc.services.mozilla.com/api/index/v1/task/project.deepspeech.tensorflow.pip.r2.2-submodule.347767452d19a45a6aeb3694e54adce4d945634a.1.android-armv7/artifacts/public/home.tar.xz" + namespace: "project.deepspeech.tensorflow.pip.r2.2-submodule.347767452d19a45a6aeb3694e54adce4d945634a.1.android-armv7" win_amd64_cpu: - url: "https://community-tc.services.mozilla.com/api/index/v1/task/project.deepspeech.tensorflow.pip.r2.2.7ead55807a2ded84c107720ebca61e6285e2c239.1.win/artifacts/public/home.tar.xz" - namespace: "project.deepspeech.tensorflow.pip.r2.2.7ead55807a2ded84c107720ebca61e6285e2c239.1.win" + url: "https://community-tc.services.mozilla.com/api/index/v1/task/project.deepspeech.tensorflow.pip.r2.2-submodule.347767452d19a45a6aeb3694e54adce4d945634a.1.win/artifacts/public/home.tar.xz" + namespace: "project.deepspeech.tensorflow.pip.r2.2-submodule.347767452d19a45a6aeb3694e54adce4d945634a.1.win" win_amd64_cuda: - url: "https://community-tc.services.mozilla.com/api/index/v1/task/project.deepspeech.tensorflow.pip.r2.2.7ead55807a2ded84c107720ebca61e6285e2c239.1.win-cuda/artifacts/public/home.tar.xz" - namespace: "project.deepspeech.tensorflow.pip.r2.2.7ead55807a2ded84c107720ebca61e6285e2c239.1.win-cuda" + url: "https://community-tc.services.mozilla.com/api/index/v1/task/project.deepspeech.tensorflow.pip.r2.2-submodule.347767452d19a45a6aeb3694e54adce4d945634a.1.win-cuda/artifacts/public/home.tar.xz" + namespace: "project.deepspeech.tensorflow.pip.r2.2-submodule.347767452d19a45a6aeb3694e54adce4d945634a.1.win-cuda" username: 'build-user' homedir: linux: '/home/build-user' diff --git a/tensorflow b/tensorflow index 7ead55807a..518c1d04bf 160000 --- 
a/tensorflow +++ b/tensorflow @@ -1 +1 @@ -Subproject commit 7ead55807a2ded84c107720ebca61e6285e2c239 +Subproject commit 518c1d04bf55d362bb11e973b8f5d0aa3e5bf44d From 5039fb51d5437970df0648b228404506fed359bc Mon Sep 17 00:00:00 2001 From: Reuben Morais Date: Mon, 29 Jun 2020 21:00:54 +0200 Subject: [PATCH 6/9] Package generate_scorer_package on Android --- native_client/Android.mk | 2 +- taskcluster/tc-package.sh | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/native_client/Android.mk b/native_client/Android.mk index ee91a7a493..d21551fd1c 100644 --- a/native_client/Android.mk +++ b/native_client/Android.mk @@ -2,7 +2,7 @@ LOCAL_PATH := $(call my-dir) include $(CLEAR_VARS) LOCAL_MODULE := deepspeech-prebuilt -LOCAL_SRC_FILES := $(TFDIR)/bazel-bin/native_client/libdeepspeech.so $(TFDIR)/bazel-bin/native_client/generate_scorer_package +LOCAL_SRC_FILES := $(TFDIR)/bazel-bin/native_client/libdeepspeech.so include $(PREBUILT_SHARED_LIBRARY) include $(CLEAR_VARS) diff --git a/taskcluster/tc-package.sh b/taskcluster/tc-package.sh index ec36bb846f..652805516e 100755 --- a/taskcluster/tc-package.sh +++ b/taskcluster/tc-package.sh @@ -35,6 +35,7 @@ package_native_client() package_native_client_ndk() { deepspeech_dir=${DS_DSDIR} + tensorflow_dir=${DS_TFDIR} artifacts_dir=${TASKCLUSTER_ARTIFACTS} artifact_name=$1 arch_abi=$2 @@ -57,7 +58,7 @@ package_native_client_ndk() tar -cf - \ -C ${deepspeech_dir}/native_client/libs/${arch_abi}/ deepspeech \ -C ${deepspeech_dir}/native_client/libs/${arch_abi}/ libdeepspeech.so \ - -C ${deepspeech_dir}/native_client/libs/${arch_abi}/ generate_scorer_package \ + -C ${tensorflow_dir}/bazel-bin/native_client/ generate_scorer_package \ -C ${deepspeech_dir}/native_client/libs/${arch_abi}/ libc++_shared.so \ -C ${deepspeech_dir}/native_client/ deepspeech.h \ -C ${deepspeech_dir}/ LICENSE \ From 2504360e95a24c7c475edf8bb84b49dadcb8b978 Mon Sep 17 00:00:00 2001 From: Reuben Morais Date: Mon, 29 Jun 2020 21:32:29 +0200 
Subject: [PATCH 7/9] Handle universal newlines in Alphabet file parsing --- native_client/alphabet.cc | 37 ++++++++++++++++++++++++++++- native_client/ctcdecode/__init__.py | 5 ++++ 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/native_client/alphabet.cc b/native_client/alphabet.cc index 873b4881be..1f0a8dbea2 100644 --- a/native_client/alphabet.cc +++ b/native_client/alphabet.cc @@ -3,6 +3,41 @@ #include +// std::getline, but handle newline conventions from multiple platforms instead +// of just the platform this code was built for +std::istream& +getline_crossplatform(std::istream& is, std::string& t) +{ + t.clear(); + + // The characters in the stream are read one-by-one using a std::streambuf. + // That is faster than reading them one-by-one using the std::istream. + // Code that uses streambuf this way must be guarded by a sentry object. + // The sentry object performs various tasks, + // such as thread synchronization and updating the stream state. + std::istream::sentry se(is, true); + std::streambuf* sb = is.rdbuf(); + + while (true) { + int c = sb->sbumpc(); + switch (c) { + case '\n': + return is; + case '\r': + if(sb->sgetc() == '\n') + sb->sbumpc(); + return is; + case std::streambuf::traits_type::eof(): + // Also handle the case when the last line has no line ending + if(t.empty()) + is.setstate(std::ios::eofbit); + return is; + default: + t += (char)c; + } + } +} + int Alphabet::init(const char *config_file) { @@ -12,7 +47,7 @@ Alphabet::init(const char *config_file) } unsigned int label = 0; space_label_ = -2; - for (std::string line; std::getline(in, line);) { + for (std::string line; getline_crossplatform(in, line);) { if (line.size() == 2 && line[0] == '\\' && line[1] == '#') { line = '#'; } else if (line[0] == '#') { diff --git a/native_client/ctcdecode/__init__.py b/native_client/ctcdecode/__init__.py index ac603aa9ba..ee5645d408 100644 --- a/native_client/ctcdecode/__init__.py +++ b/native_client/ctcdecode/__init__.py @@ -45,6 
+45,11 @@ def __init__(self, config_path): if err != 0: raise ValueError('Alphabet initialization failed with error code 0x{:X}'.format(err)) + def Encode(self, input): + """Convert SWIG's UnsignedIntVec to a Python list""" + res = super(Alphabet, self).Encode(input) + return [el for el in res] + def ctc_beam_search_decoder(probs_seq, alphabet, From 8f6106b35d2b84af160227640390590a2dc4c3ef Mon Sep 17 00:00:00 2001 From: Reuben Morais Date: Tue, 30 Jun 2020 16:47:41 +0200 Subject: [PATCH 8/9] Update docs to refer to new generate_scorer_package --- data/README.rst | 4 ++-- doc/Decoder.rst | 6 ++++-- doc/Scorer.rst | 16 ++++++++++------ training/deepspeech_training/util/flags.py | 2 +- 4 files changed, 17 insertions(+), 11 deletions(-) diff --git a/data/README.rst b/data/README.rst index 4d2c022aaa..aeb3486457 100644 --- a/data/README.rst +++ b/data/README.rst @@ -3,9 +3,9 @@ Language-Specific Data This directory contains language-specific data files. Most importantly, you will find here: -1. A list of unique characters for the target language (e.g. English) in `data/alphabet.txt` +1. A list of unique characters for the target language (e.g. English) in ``data/alphabet.txt`` -2. A scorer package (`data/lm/kenlm.scorer`) generated with `data/lm/generate_package.py`. The scorer package includes a binary n-gram language model generated with `data/lm/generate_lm.py`. +2. A scorer package (``data/lm/kenlm.scorer``) generated with ``generate_scorer_package`` (``native_client/generate_scorer_package.cpp``). The scorer package includes a binary n-gram language model generated with ``data/lm/generate_lm.py``. For more information on how to build these resources from scratch, see the ``External scorer scripts`` section on `deepspeech.readthedocs.io `_. 
diff --git a/doc/Decoder.rst b/doc/Decoder.rst index 63e3ac2da6..1115e38e61 100644 --- a/doc/Decoder.rst +++ b/doc/Decoder.rst @@ -56,9 +56,11 @@ At decoding time, the scorer is queried every time a Unicode codepoint is predic **Acoustic models trained with ``--utf8`` MUST NOT be used with an alphabet based scorer. Conversely, acoustic models trained with an alphabet file MUST NOT be used with a UTF-8 scorer.** -UTF-8 scorers can be built by using an input corpus with space separated codepoints. If your corpus only contains single codepoints separated by spaces, ``data/lm/generate_package.py`` should automatically enable UTF-8 mode, and it should print the message "Looks like a character based model." +UTF-8 scorers can be built by using an input corpus with space separated codepoints. If your corpus only contains single codepoints separated by spaces, ``generate_scorer_package`` should automatically enable UTF-8 mode, and it should print the message "Looks like a character based model." -If the message "Doesn't look like a character based model." is printed, you should double check your inputs to make sure it only contains single codepoints separated by spaces. UTF-8 mode can be forced by specifying the ``--force_utf8`` flag when running ``data/lm/generate_package.py``, but it is NOT RECOMMENDED. +If the message "Doesn't look like a character based model." is printed, you should double check your inputs to make sure it only contains single codepoints separated by spaces. UTF-8 mode can be forced by specifying the ``--force_utf8`` flag when running ``generate_scorer_package``, but it is NOT RECOMMENDED. + +See :ref:`scorer-scripts` for more details on using ``generate_scorer_package``. Because KenLM uses spaces as a word separator, the resulting language model will not include space characters in it. 
If you wish to use UTF-8 mode but still model spaces, you need to replace spaces in the input corpus with a different character **before** converting it to space separated codepoints. For example: diff --git a/doc/Scorer.rst b/doc/Scorer.rst index 8df94a74ce..42a7e8332d 100644 --- a/doc/Scorer.rst +++ b/doc/Scorer.rst @@ -5,7 +5,9 @@ External scorer scripts DeepSpeech pre-trained models include an external scorer. This document explains how to reproduce our external scorer, as well as adapt the scripts to create your own. -The scorer is composed of two sub-components, a KenLM language model and a trie data structure containing all words in the vocabulary. In order to create the scorer package, first we must create a KenLM language model (using ``data/lm/generate_lm.py``, and then use ``data/lm/generate_package.py`` to create the final package file including the trie data structure. +The scorer is composed of two sub-components, a KenLM language model and a trie data structure containing all words in the vocabulary. In order to create the scorer package, first we must create a KenLM language model (using ``data/lm/generate_lm.py``), and then use ``generate_scorer_package`` to create the final package file including the trie data structure. + +The ``generate_scorer_package`` binary is part of the native client package that is included with official releases. You can find the appropriate archive for your platform in the `GitHub release downloads `_. The native client package is named ``native_client.{arch}.{config}.{plat}.tar.xz``, where ``{arch}`` is the architecture the binary was built for, for example ``amd64`` or ``arm64``, ``{config}`` is the build configuration, which for building decoder packages does not matter, and ``{plat}`` is the platform the binary was built for, for example ``linux`` or ``osx``. If you wanted to run the ``generate_scorer_package`` binary on a Linux desktop, you would download ``native_client.amd64.cpu.linux.tar.xz``.
Reproducing our external scorer ------------------------------- @@ -36,12 +38,15 @@ Else you have to build `KenLM `_ first and then pa --binary_a_bits 255 --binary_q_bits 8 --binary_type trie -Afterwards you can use ``generate_package.py`` to generate the scorer package using the ``lm.binary`` and ``vocab-500000.txt`` files: +Afterwards you can use ``generate_scorer_package`` to generate the scorer package using the ``lm.binary`` and ``vocab-500000.txt`` files: .. code-block:: bash cd data/lm - python3 generate_package.py --alphabet ../alphabet.txt --lm lm.binary --vocab vocab-500000.txt \ + # Download and extract appropriate native_client package: + curl -LO ... + tar xvf native_client.*.tar.xz + ./generate_scorer_package --alphabet ../alphabet.txt --lm lm.binary --vocab vocab-500000.txt \ --package kenlm.scorer --default_alpha 0.931289039105002 --default_beta 1.1834137581510284 Building your own scorer @@ -51,7 +56,6 @@ Building your own scorer can be useful if you're using models in a narrow usage The LibriSpeech LM training text used by our scorer is around 4GB uncompressed, which should give an idea of the size of a corpus needed for a reasonable language model for general speech recognition. For more constrained use cases with smaller vocabularies, you don't need as much data, but you should still try to gather as much as you can. -With a text corpus in hand, you can then re-use the ``generate_lm.py`` and ``generate_package.py`` scripts to create your own scorer that is compatible with DeepSpeech clients and language bindings. Before building the language model, you must first familiarize yourself with the `KenLM toolkit `_. Most of the options exposed by the ``generate_lm.py`` script are simply forwarded to KenLM options of the same name, so you must read the KenLM documentation in order to fully understand their behavior. 
+With a text corpus in hand, you can then re-use ``generate_lm.py`` and ``generate_scorer_package`` to create your own scorer that is compatible with DeepSpeech clients and language bindings. Before building the language model, you must first familiarize yourself with the `KenLM toolkit `_. Most of the options exposed by the ``generate_lm.py`` script are simply forwarded to KenLM options of the same name, so you must read the KenLM documentation in order to fully understand their behavior. -After using ``generate_lm.py`` to create a KenLM language model binary file, you can use ``generate_package.py`` to create a scorer package as described in the previous section. Note that we have a :github:`lm_optimizer.py script ` which can be used to find good default values for alpha and beta. To use it, you must first -generate a package with any value set for default alpha and beta flags. For this step, it doesn't matter what values you use, as they'll be overridden by ``lm_optimizer.py``. Then, use ``lm_optimizer.py`` with this scorer file to find good alpha and beta values. Finally, use ``generate_package.py`` again, this time with the new values. +After using ``generate_lm.py`` to create a KenLM language model binary file, you can use ``generate_scorer_package`` to create a scorer package as described in the previous section. Note that we have a :github:`lm_optimizer.py script ` which can be used to find good default values for alpha and beta. To use it, you must first generate a package with any value set for default alpha and beta flags. For this step, it doesn't matter what values you use, as they'll be overridden by ``lm_optimizer.py`` later. Then, use ``lm_optimizer.py`` with this scorer file to find good alpha and beta values. Finally, use ``generate_scorer_package`` again, this time with the new values. 
diff --git a/training/deepspeech_training/util/flags.py b/training/deepspeech_training/util/flags.py index c31eb461e7..6bf6425116 100644 --- a/training/deepspeech_training/util/flags.py +++ b/training/deepspeech_training/util/flags.py @@ -151,7 +151,7 @@ def str_val_equals_help(name, val_desc): f.DEFINE_boolean('utf8', False, 'enable UTF-8 mode. When this is used the model outputs UTF-8 sequences directly rather than using an alphabet mapping.') f.DEFINE_string('alphabet_config_path', 'data/alphabet.txt', 'path to the configuration file specifying the alphabet used by the network. See the comment in data/alphabet.txt for a description of the format.') - f.DEFINE_string('scorer_path', 'data/lm/kenlm.scorer', 'path to the external scorer file created with data/lm/generate_package.py') + f.DEFINE_string('scorer_path', 'data/lm/kenlm.scorer', 'path to the external scorer file.') f.DEFINE_alias('scorer', 'scorer_path') f.DEFINE_integer('beam_width', 1024, 'beam width used in the CTC decoder when building candidate transcriptions') f.DEFINE_float('lm_alpha', 0.931289039105002, 'the alpha hyperparameter of the CTC decoder. Language Model weight.') From 65915c7f57a1b39a36e8d5f8e28595251f1bd752 Mon Sep 17 00:00:00 2001 From: Reuben Morais Date: Thu, 2 Jul 2020 14:09:42 +0200 Subject: [PATCH 9/9] Address review comments --- data/README.rst | 2 +- doc/Scorer.rst | 2 +- taskcluster/.shared.yml | 36 ++++++++++++++++++------------------ 3 files changed, 20 insertions(+), 20 deletions(-) diff --git a/data/README.rst b/data/README.rst index aeb3486457..3a60ea5a17 100644 --- a/data/README.rst +++ b/data/README.rst @@ -3,7 +3,7 @@ Language-Specific Data This directory contains language-specific data files. Most importantly, you will find here: -1. A list of unique characters for the target language (e.g. English) in ``data/alphabet.txt`` +1. A list of unique characters for the target language (e.g. English) in ``data/alphabet.txt``. 
After installing the training code, you can check ``python -m deepspeech_training.util.check_characters --help`` for a tool that creates an alphabet file from a list of training CSV files. 2. A scorer package (``data/lm/kenlm.scorer``) generated with ``generate_scorer_package`` (``native_client/generate_scorer_package.cpp``). The scorer package includes a binary n-gram language model generated with ``data/lm/generate_lm.py``. diff --git a/doc/Scorer.rst b/doc/Scorer.rst index 42a7e8332d..04ce2d686b 100644 --- a/doc/Scorer.rst +++ b/doc/Scorer.rst @@ -44,7 +44,7 @@ Afterwards you can use ``generate_scorer_package`` to generate the scorer packag cd data/lm # Download and extract appropriate native_client package: - curl -LO ... + curl -LO https://github.com/mozilla/DeepSpeech/releases/... tar xvf native_client.*.tar.xz ./generate_scorer_package --alphabet ../alphabet.txt --lm lm.binary --vocab vocab-500000.txt \ --package kenlm.scorer --default_alpha 0.931289039105002 --default_beta 1.1834137581510284 diff --git a/taskcluster/.shared.yml b/taskcluster/.shared.yml index 46cd698375..ca6cd9ffd4 100644 --- a/taskcluster/.shared.yml +++ b/taskcluster/.shared.yml @@ -142,32 +142,32 @@ system: namespace: "project.deepspeech.swig.win.amd64.b5fea54d39832d1d132d7dd921b69c0c2c9d5118" tensorflow: linux_amd64_cpu: - url: "https://community-tc.services.mozilla.com/api/index/v1/task/project.deepspeech.tensorflow.pip.r2.2-submodule.347767452d19a45a6aeb3694e54adce4d945634a.1.cpu/artifacts/public/home.tar.xz" - namespace: "project.deepspeech.tensorflow.pip.r2.2-submodule.347767452d19a45a6aeb3694e54adce4d945634a.1.cpu" + url: "https://community-tc.services.mozilla.com/api/index/v1/task/project.deepspeech.tensorflow.pip.r2.2.518c1d04bf55d362bb11e973b8f5d0aa3e5bf44d.0.cpu/artifacts/public/home.tar.xz" + namespace: "project.deepspeech.tensorflow.pip.r2.2.518c1d04bf55d362bb11e973b8f5d0aa3e5bf44d.0.cpu" linux_amd64_cuda: - url:
"https://community-tc.services.mozilla.com/api/index/v1/task/project.deepspeech.tensorflow.pip.r2.2-submodule.347767452d19a45a6aeb3694e54adce4d945634a.1.cuda/artifacts/public/home.tar.xz" - namespace: "project.deepspeech.tensorflow.pip.r2.2-submodule.347767452d19a45a6aeb3694e54adce4d945634a.1.cuda" + url: "https://community-tc.services.mozilla.com/api/index/v1/task/project.deepspeech.tensorflow.pip.r2.2.518c1d04bf55d362bb11e973b8f5d0aa3e5bf44d.0.cuda/artifacts/public/home.tar.xz" + namespace: "project.deepspeech.tensorflow.pip.r2.2.518c1d04bf55d362bb11e973b8f5d0aa3e5bf44d.0.cuda" linux_armv7: - url: "https://community-tc.services.mozilla.com/api/index/v1/task/project.deepspeech.tensorflow.pip.r2.2-submodule.347767452d19a45a6aeb3694e54adce4d945634a.1.arm/artifacts/public/home.tar.xz" - namespace: "project.deepspeech.tensorflow.pip.r2.2-submodule.347767452d19a45a6aeb3694e54adce4d945634a.1.arm" + url: "https://community-tc.services.mozilla.com/api/index/v1/task/project.deepspeech.tensorflow.pip.r2.2.518c1d04bf55d362bb11e973b8f5d0aa3e5bf44d.0.arm/artifacts/public/home.tar.xz" + namespace: "project.deepspeech.tensorflow.pip.r2.2.518c1d04bf55d362bb11e973b8f5d0aa3e5bf44d.0.arm" linux_arm64: - url: "https://community-tc.services.mozilla.com/api/index/v1/task/project.deepspeech.tensorflow.pip.r2.2-submodule.347767452d19a45a6aeb3694e54adce4d945634a.1.arm64/artifacts/public/home.tar.xz" - namespace: "project.deepspeech.tensorflow.pip.r2.2-submodule.347767452d19a45a6aeb3694e54adce4d945634a.1.arm64" + url: "https://community-tc.services.mozilla.com/api/index/v1/task/project.deepspeech.tensorflow.pip.r2.2.518c1d04bf55d362bb11e973b8f5d0aa3e5bf44d.0.arm64/artifacts/public/home.tar.xz" + namespace: "project.deepspeech.tensorflow.pip.r2.2.518c1d04bf55d362bb11e973b8f5d0aa3e5bf44d.0.arm64" darwin_amd64: - url: 
"https://community-tc.services.mozilla.com/api/index/v1/task/project.deepspeech.tensorflow.pip.r2.2-submodule.347767452d19a45a6aeb3694e54adce4d945634a.1.osx/artifacts/public/home.tar.xz" - namespace: "project.deepspeech.tensorflow.pip.r2.2-submodule.347767452d19a45a6aeb3694e54adce4d945634a.1.osx" + url: "https://community-tc.services.mozilla.com/api/index/v1/task/project.deepspeech.tensorflow.pip.r2.2.518c1d04bf55d362bb11e973b8f5d0aa3e5bf44d.0.osx/artifacts/public/home.tar.xz" + namespace: "project.deepspeech.tensorflow.pip.r2.2.518c1d04bf55d362bb11e973b8f5d0aa3e5bf44d.0.osx" android_arm64: - url: "https://community-tc.services.mozilla.com/api/index/v1/task/project.deepspeech.tensorflow.pip.r2.2-submodule.347767452d19a45a6aeb3694e54adce4d945634a.1.android-arm64/artifacts/public/home.tar.xz" - namespace: "project.deepspeech.tensorflow.pip.r2.2-submodule.347767452d19a45a6aeb3694e54adce4d945634a.1.android-arm64" + url: "https://community-tc.services.mozilla.com/api/index/v1/task/project.deepspeech.tensorflow.pip.r2.2.518c1d04bf55d362bb11e973b8f5d0aa3e5bf44d.0.android-arm64/artifacts/public/home.tar.xz" + namespace: "project.deepspeech.tensorflow.pip.r2.2.518c1d04bf55d362bb11e973b8f5d0aa3e5bf44d.0.android-arm64" android_armv7: - url: "https://community-tc.services.mozilla.com/api/index/v1/task/project.deepspeech.tensorflow.pip.r2.2-submodule.347767452d19a45a6aeb3694e54adce4d945634a.1.android-armv7/artifacts/public/home.tar.xz" - namespace: "project.deepspeech.tensorflow.pip.r2.2-submodule.347767452d19a45a6aeb3694e54adce4d945634a.1.android-armv7" + url: "https://community-tc.services.mozilla.com/api/index/v1/task/project.deepspeech.tensorflow.pip.r2.2.518c1d04bf55d362bb11e973b8f5d0aa3e5bf44d.0.android-armv7/artifacts/public/home.tar.xz" + namespace: "project.deepspeech.tensorflow.pip.r2.2.518c1d04bf55d362bb11e973b8f5d0aa3e5bf44d.0.android-armv7" win_amd64_cpu: - url: 
"https://community-tc.services.mozilla.com/api/index/v1/task/project.deepspeech.tensorflow.pip.r2.2-submodule.347767452d19a45a6aeb3694e54adce4d945634a.1.win/artifacts/public/home.tar.xz" - namespace: "project.deepspeech.tensorflow.pip.r2.2-submodule.347767452d19a45a6aeb3694e54adce4d945634a.1.win" + url: "https://community-tc.services.mozilla.com/api/index/v1/task/project.deepspeech.tensorflow.pip.r2.2.518c1d04bf55d362bb11e973b8f5d0aa3e5bf44d.0.win/artifacts/public/home.tar.xz" + namespace: "project.deepspeech.tensorflow.pip.r2.2.518c1d04bf55d362bb11e973b8f5d0aa3e5bf44d.0.win" win_amd64_cuda: - url: "https://community-tc.services.mozilla.com/api/index/v1/task/project.deepspeech.tensorflow.pip.r2.2-submodule.347767452d19a45a6aeb3694e54adce4d945634a.1.win-cuda/artifacts/public/home.tar.xz" - namespace: "project.deepspeech.tensorflow.pip.r2.2-submodule.347767452d19a45a6aeb3694e54adce4d945634a.1.win-cuda" + url: "https://community-tc.services.mozilla.com/api/index/v1/task/project.deepspeech.tensorflow.pip.r2.2.518c1d04bf55d362bb11e973b8f5d0aa3e5bf44d.0.win-cuda/artifacts/public/home.tar.xz" + namespace: "project.deepspeech.tensorflow.pip.r2.2.518c1d04bf55d362bb11e973b8f5d0aa3e5bf44d.0.win-cuda" username: 'build-user' homedir: linux: '/home/build-user'