diff --git a/.gitmodules b/.gitmodules index f25c178..246fd7f 100644 --- a/.gitmodules +++ b/.gitmodules @@ -99,3 +99,6 @@ [submodule "german/de_deepspeech-aashishag"] path = german/de_deepspeech-aashishag url = https://github.com/synesthesiam/de_deepspeech-aashishag.git +[submodule "korean/ko-kr_kaldi-montreal"] + path = korean/ko-kr_kaldi-montreal + url = https://github.com/synesthesiam/ko-kr_kaldi-montreal.git diff --git a/PROFILES b/PROFILES index 64e02e3..89ab54e 100644 --- a/PROFILES +++ b/PROFILES @@ -6,6 +6,7 @@ english/en-us_deepspeech-mozilla english/en-us_julius-github english/en-us_kaldi-zamia english/en-us_pocketsphinx-cmu +french/fr_kaldi-guyot french/fr_pocketsphinx-cmu german/de_deepspeech-aashishag german/de_kaldi-zamia @@ -14,7 +15,7 @@ greek/el-gr_pocketsphinx-cmu hindi/hi_pocketsphinx-cmu italian/it_pocketsphinx-cmu kazakh/kz_pocketsphinx-cmu -korean/ko-KR_kaldi-montreal +korean/ko-kr_kaldi-montreal mandarin/zh-cn_pocketsphinx-cmu polish/pl_julius-github portuguese/pt-br_pocketsphinx-cmu diff --git a/README.md b/README.md index f0d2587..e56ccc0 100644 --- a/README.md +++ b/README.md @@ -41,181 +41,277 @@ Untested profiles (highlighted below) *may* work, but I don't have the necessary - - Download + + + Download + Catalan ca-es pocketsphinx - UNTESTED - UNTESTED + UNTESTED + + UNTESTED + - Download - Dutch (Nederlands) + + Download + + Dutch (Nederlands) nl - pocketsphinx - ★ ★ ★ (36x) - ☹ (6x) + kaldi + ★ ★ ★ ★ ★ (2x) + ☹ (1x) - Download - Dutch (Nederlands) + + Download + + Dutch (Nederlands) nl - kaldi - ★ ★ ★ ★ ★ (17x) - ☹ ☹ ☹ ☹ ☹ (8x) + pocketsphinx + ★ ★ ★ ★ (18x) + ☹ (3x) - Download + + Download + English - en-us - kaldi - ★ ★ ★ ★ ★ (3x) - ★ ★ ★ ★ ★ (4x) + en-in + pocketsphinx + ☹ (4x) + ☹ (4x) - Download + + Download + English en-us - pocketsphinx - ★ ★ ★ ★ ★ (17x) - ★ ★ (2x) + deepspeech + ★ ★ ★ ★ ★ (1x) + ★ ★ ★ ★ (1x) - Download + + Download + English en-us julius - ★ ★ ★ ★ ★ (2x) - ☹ (1x) + ★ ★ ★ ★ (1x) + + UNTESTED + - - Download - 
Indian English - en-in + + + Download + + English + en-us + kaldi + ★ ★ ★ ★ ★ (3x) + ★ ★ ★ ★ (1x) + + + + Download + + English + en-us pocketsphinx - UNTESTED - UNTESTED + ★ ★ ★ ★ ★ (9x) + ★ ★ ★ ★ (2x) - Download + + Download + + French (Français) + fr + kaldi + ★ ★ ★ ★ (4x) + ★ ★ ★ ★ (1x) + + + + Download + French (Français) fr pocketsphinx - ★ ★ ★ (49x) - ☹ (4x) + ★ ★ ★ ★ (23x) + ☹ (3x) - Download + + Download + German de - kaldi - ★ ★ ★ ★ ★ (3x) + pocketsphinx + ★ ★ ★ ★ ★ (17x) ★ ★ ★ ★ ★ (3x) - Download + + Download + German - de - pocketsphinx - ★ ★ ★ ★ ★ (29x) - ★ ★ ★ ★ ★ (5x) + de-DE + deepspeech + ★ ★ ★ ★ ★ (1x) + ★ ★ ★ ★ (1x) + + + + Download + + German + de-DE + kaldi + ★ ★ ★ ★ ★ (4x) + ★ ★ ★ ★ (1x) - Download - Greek (Ελληνικά) + + Download + + Greek (Ελληνικά) el-gr pocketsphinx - ★ ★ (17x) + ★ ★ ★ ★ ★ (15x) ☹ (1x) - - Download + + + Download + Hindi (Devanagari) hi pocketsphinx - UNTESTED - UNTESTED + UNTESTED + + UNTESTED + - Download + + Download + Italian (Italiano) it pocketsphinx - ★ ★ ★ ★ ★ (39x) - ★ ★ ★ ★ ★ (14x) + ★ ★ ★ ★ ★ (21x) + ★ ★ ★ ★ ★ (7x) - - Download + + + Download + Kazakh (қазақша) kz pocketsphinx - UNTESTED - UNTESTED + UNTESTED + + UNTESTED + - - Download - Mandarin (中文) + + + Download + + Korean + ko-kr + kaldi + ☹ (4x) + ☹ (4x) + + + + Download + + Mandarin zh-cn pocketsphinx - UNTESTED - UNTESTED + UNTESTED + + UNTESTED + - Download + + Download + Polish (polski) pl julius - ★ (1x) - UNTESTED + UNTESTED + + UNTESTED + - Download - Portugese (Português) + + Download + + Portuguese (Português) pt-br pocketsphinx - ★ ★ (77x) - ☹ (20x) + ★ ★ ★ ★ (51x) + ☹ (11x) - Download + + Download + Russian (Русский) ru pocketsphinx - ★ ★ ★ ★ ★ (21x) + ★ ★ ★ ★ ★ (17x) ☹ (1x) - Download + + Download + Spanish (Español) es pocketsphinx - ★ ★ ★ ★ (35x) - ★ ★ ★ (22x) + ★ ★ ★ ★ (25x) + ★ ★ ★ ★ (15x) - - Download - Mexican Spanish + + + Download + + Spanish es-mexican pocketsphinx - UNTESTED - UNTESTED + ★ ★ ★ ★ ★ (9x) + ★ ★ ★ ★ (2x) - Download + + 
#!/usr/bin/env python3
"""Render an HTML table summarising the test results of each profile.

Profile directories (relative to the repository root) are read from standard
input, one per line.  For every profile the script loads ``profile.yml`` and,
when present, the closed/open test reports, then prints an indented HTML
``<table>`` to standard output.  Progress and missing-report diagnostics go
to standard error.
"""
import json
import math
import os
import sys
from pathlib import Path

# Native-language names shown in parentheses after the English name, keyed by
# the locale code found in profile.yml ("language" -> "code").
NATIVE = {
    "nl": "Nederlands",
    "fr": "Français",
    "el-gr": "Ελληνικά",
    "hi": "Devanagari",
    "it": "Italiano",
    "kz": "қазақша",
    "zh": "中文",
    # The Mandarin profile uses the locale "zh-cn"; without this key the
    # exact-match lookup never found a native name for it.
    "zh-cn": "中文",
    "pl": "polski",
    "pt-br": "Português",
    "ru": "Русский",
    "es": "Español",
    "sv": "svenska",
    "vi": "Tiếng Việt",
}

STAR = "★"
SAD_FACE = "☹"

# (minimum accuracy, number of stars), checked from best to worst.
_STAR_THRESHOLDS = (
    (0.95, 5),
    (0.90, 4),
    (0.85, 3),
    (0.80, 2),
    (0.75, 1),
)

# -----------------------------------------------------------------------------


def main():
    """Read profile directories from stdin and print the HTML table."""
    # Third-party modules are imported here rather than at module level so
    # that the pure helpers (to_stars, env_constructor) can be imported
    # without the YAML/HTML dependencies being installed.
    import yaml
    from yattag import Doc, indent

    yaml.SafeLoader.add_constructor("!env", env_constructor)
    base_dir = Path(__file__).parent.parent

    closed_reports = {}
    open_reports = {}
    profile_yml = {}

    for line in sys.stdin:
        relative_dir = line.strip()
        if not relative_dir:
            # A blank line would resolve to the repository root itself.
            continue

        profile_dir = base_dir / relative_dir
        profile_name = profile_dir.name

        with open(profile_dir / "profile.yml", "r", encoding="utf-8") as yaml_file:
            profile_yml[profile_name] = yaml.safe_load(yaml_file)

        closed_path = profile_dir / "test" / "closed" / "report.json"
        open_path = profile_dir / "test" / "open" / "report.json"

        if not open_path.is_file():
            # Try alternative name
            open_path = profile_dir / "test" / "open" / "report_open.json"

        closed_report = _load_report(closed_path)
        if closed_report is not None:
            closed_reports[profile_name] = closed_report

        open_report = _load_report(open_path)
        if open_report is not None:
            open_reports[profile_name] = open_report

    rows = []
    for profile_name, profile in profile_yml.items():
        print("Processing", profile_name, file=sys.stderr)
        closed_report = closed_reports.get(profile_name)
        open_report = open_reports.get(profile_name)

        row = {
            "name": profile_name,
            "version": profile["version"],
            "language": profile["language"]["name"],
            "locale": profile["language"]["code"],
            "system": profile["speech-to-text"]["acoustic-model-type"],
        }

        if closed_report:
            row["closed_accuracy"] = closed_report["transcription_accuracy"]
            row["closed_speedup"] = closed_report["average_transcription_speedup"]

        if open_report:
            row["open_accuracy"] = open_report["transcription_accuracy"]
            row["open_speedup"] = open_report["average_transcription_speedup"]

        rows.append(row)

    # Convert to HTML
    rows = sorted(rows, key=lambda r: (r["language"], r["locale"]))
    doc, tag, text = Doc().tagtext()

    with tag("table"):
        # Header
        with tag("thead"):
            with tag("tr"):
                with tag("th"):
                    # Download column has no heading
                    pass

                for heading in ("Language", "Locale", "System", "Closed", "Open"):
                    with tag("th"):
                        text(heading)

        # Body
        with tag("tbody"):
            for row in rows:
                with tag("tr"):

                    # Download
                    with tag("td"):
                        with tag(
                            "a",
                            href=f'https://github.com/synesthesiam/{row["name"]}/archive/v{row["version"]}.tar.gz',
                        ):
                            text("Download")

                    # Language
                    with tag("td"):
                        text(_display_language(row["language"], row["locale"]))

                    # Locale
                    with tag("td"):
                        text(row["locale"])

                    # System
                    with tag("td"):
                        text(row["system"])

                    # Closed
                    with tag("td"):
                        closed_text = _format_result(
                            row.get("closed_accuracy"), row.get("closed_speedup")
                        )

                        if closed_text is None:
                            text("UNTESTED")
                        else:
                            # Already plain star/face glyphs; emit unescaped.
                            doc.asis(closed_text)

                    # Open
                    with tag("td"):
                        open_text = _format_result(
                            row.get("open_accuracy"), row.get("open_speedup")
                        )

                        if open_text is None:
                            with tag("strong"):
                                text("UNTESTED")
                        else:
                            doc.asis(open_text)

    print(indent(doc.getvalue()))


# -----------------------------------------------------------------------------


def _load_report(path):
    """Return the parsed JSON report at *path*, or None if it does not exist.

    A missing report is noted on stderr with the path that was actually
    looked for.
    """
    if not path.is_file():
        print("Missing", path, file=sys.stderr)
        return None

    with open(path, "r", encoding="utf-8") as report_file:
        return json.load(report_file)


def _display_language(language, locale):
    """Capitalise *language* and append its native name when one is known."""
    # Slicing (rather than indexing) keeps an empty name from raising.
    display = language[:1].upper() + language[1:]

    native = NATIVE.get(locale)
    if native:
        display = f"{display} ({native})"

    return display


def _format_result(accuracy, speedup):
    """Return the table text for one test result, or None when untested.

    The text is the star rating for *accuracy*, followed by the speed-up
    rounded up to a whole number, e.g. ``"★ ★ ★ ★ (3x)"``.
    """
    # Compare against None so that a measured accuracy of 0 is still shown
    # (as a sad face) instead of being reported as untested.
    if accuracy is None:
        return None

    result = to_stars(accuracy)
    if speedup:
        speedx = int(math.ceil(float(speedup)))
        result = f"{result} ({speedx}x)"

    return result


def to_stars(accuracy):
    """Convert an accuracy in [0, 1] to a star rating string.

    Below 0.75 a sad face is returned.  From there each 0.05 step adds one
    star: [0.75, 0.80) is one star, up to five stars at 0.95 and above.
    Stars are separated by single spaces.  *accuracy* may be a number or a
    numeric string.
    """
    accuracy = float(accuracy)

    # Walk from the best rating down and stop at the first threshold met, so
    # that a lower band can never be overwritten by a higher one.
    for minimum, num_stars in _STAR_THRESHOLDS:
        if accuracy >= minimum:
            return " ".join([STAR] * num_stars)

    return SAD_FACE


def env_constructor(loader, node):
    """Expand !env STRING to replace environment variables in STRING."""
    return os.path.expandvars(node.value)


# -----------------------------------------------------------------------------
if __name__ == "__main__":
    main()
8a5cc6e..0000000 --- a/korean/ko-KR_kaldi-montreal +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 8a5cc6e496690c8c88ac0ebebba9755e5a731216 diff --git a/korean/ko-kr_kaldi-montreal b/korean/ko-kr_kaldi-montreal new file mode 160000 index 0000000..83b3d74 --- /dev/null +++ b/korean/ko-kr_kaldi-montreal @@ -0,0 +1 @@ +Subproject commit 83b3d74de5ee9ed5c32a82828b32d211db0ac0a7 diff --git a/polish/pl_julius-github b/polish/pl_julius-github index 428638d..e7b770b 160000 --- a/polish/pl_julius-github +++ b/polish/pl_julius-github @@ -1 +1 @@ -Subproject commit 428638d6e292abe054a8e77df455944836701b45 +Subproject commit e7b770b1368ae8dcab104ceae62a45ad61099c46 diff --git a/requirements.txt b/requirements.txt index fd53361..78b8384 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,8 @@ -bs4 -html5lib -requests -pyyaml -pydash -conllu +beautifulsoup4==4.7.1 +conllu==1.3.1 +html5lib==1.0.1 +pydash==4.7.4 +pyyaml==5.1.2 quart==0.6.15 +requests==2.21.0 +yattag==1.13.2