diff --git a/bin/align_phonemes.py b/bin/align_phonemes.py
new file mode 100644
index 0000000..978d2a2
--- /dev/null
+++ b/bin/align_phonemes.py
@@ -0,0 +1,157 @@
+#!/usr/bin/env python3
+"""
+Attempts to guess the correspondence between a speech system's phonemes and some
+other phoneme set (e.g., IPA or eSpeak).
+
+Requires 3 text files:
+* phonemes - one word per line, followed by its speech system phonemes
+* other - one word per line, followed by its other phonemes
+* examples - one speech system phoneme per line, followed by an example word
+
+Alignment is done in 2 passes:
+
+1. Count all cases where the speech/other phoneme lists are the same length for
+a given word. Assign phonemes based on highest count first.
+
+2. Go back and assume unaligned phonemes are composed of two "others" instead of
+one. Assign remaining phonemes based on highest count first.
+"""
+import argparse
+import typing
+from collections import Counter
+from dataclasses import dataclass, field
+
+
+@dataclass
+class Word:
+    """Word with speech system and other phonemes."""
+
+    word: str
+    phonemes: typing.List[str] = field(default_factory=list)
+    other: typing.List[str] = field(default_factory=list)
+
+
+def clean(s: str) -> str:
+    """Strip whitespace and remove stress accents (apostrophes)."""
+    return s.strip().replace("'", "")
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "phonemes", help="Path to file with words and speech system phonemes"
+    )
+    parser.add_argument("other", help="Path to file with words and other phonemes")
+    parser.add_argument(
+        "examples", help="Path to file with speech system phoneme examples"
+    )
+    parser.add_argument(
+        "--missing", default="?", help="String to print for missing phonemes"
+    )
+    args = parser.parse_args()
+
+    words: typing.Dict[str, Word] = {}
+
+    # Load speech system phonemes
+    with open(args.phonemes, "r") as phoneme_file:
+        for line in phoneme_file:
+            line = line.strip()
+            if line:
+                word_str, *phonemes = line.split()
+                word = words.get(word_str)
+                if not word:
+                    word = Word(word_str)
+                    words[word_str] = word
+
+                word.phonemes = [clean(p) for p in phonemes]
+
+    # Load other phonemes
+    with open(args.other, "r") as other_file:
+        for line in other_file:
+            line = line.strip()
+            if line:
+                word_str, *others = line.split()
+                word = words.get(word_str)
+                if not word:
+                    word = Word(word_str)
+                    words[word_str] = word
+
+                word.other = [clean(o) for o in others]
+
+    # Load phoneme examples
+    all_phonemes: typing.Set[str] = set()
+    phoneme_example: typing.Dict[str, str] = {}
+
+    with open(args.examples, "r") as examples_file:
+        for line in examples_file:
+            line = line.strip()
+            if line:
+                phoneme, example, *_ = line.split()
+                all_phonemes.add(phoneme)
+                phoneme_example[phoneme] = example
+
+    # -------------------------------------------------------------------------
+
+    # phoneme -> other
+    assignments: typing.Dict[str, str] = {}
+
+    # ------
+    # Pass 1
+    # ------
+    # Find candidates with identical lengths.
+    phoneme_other = Counter()
+    for word in words.values():
+        if len(word.phonemes) == len(word.other):
+            for phoneme, other in zip(word.phonemes, word.other):
+                phoneme_other[(phoneme, other)] += 1
+
+    # Assign naively based purely on count
+    for candidate, count in phoneme_other.most_common():
+        phoneme, other = candidate
+        if (phoneme not in assignments) and (other not in assignments.values()):
+            assignments[phoneme] = other
+
+    # ------
+    # Pass 2
+    # ------
+    # Assume unassigned phonemes map to two "others".
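+    # Illustrative example (hypothetical symbols): if the speech system has a
+    # single phoneme "tS" that the other set writes as "t" followed by "S",
+    # pass 1 leaves "tS" unassigned. Pass 2 re-walks each word and counts the
+    # combined candidate ("tS", "tS"), which can then be assigned below.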
+    unassigned = all_phonemes - set(assignments)
+    if unassigned:
+        for word in words.values():
+            if len(word.other) > len(word.phonemes):
+                others = list(word.other)
+                for phoneme in word.phonemes:
+                    if not others:
+                        # No more others left
+                        break
+
+                    if (phoneme in unassigned) and (len(others) >= 2):
+                        # Grab two "others" for this unassigned phoneme
+                        phoneme_other[(phoneme, "".join(others[:2]))] += 1
+
+                        # Consume both "others" before moving on
+                        others = others[2:]
+                    else:
+                        # Skip over "other" for already assigned phoneme
+                        others = others[1:]
+
+    # Do assignment again with (hopefully) new candidates
+    for candidate, count in phoneme_other.most_common():
+        phoneme, other = candidate
+        if (phoneme not in assignments) and (other not in assignments.values()):
+            assignments[phoneme] = other
+
+    # -------------------------------------------------------------------------
+
+    # Print candidates and ? for phonemes with no candidates
+    for phoneme in sorted(all_phonemes):
+        print(
+            phoneme,
+            assignments.get(phoneme, args.missing),
+            phoneme_example.get(phoneme, ""),
+        )
+
+
+# -----------------------------------------------------------------------------
+
+if __name__ == "__main__":
+    main()
diff --git a/bin/get_frequent_words.py b/bin/get_frequent_words.py
new file mode 100644
index 0000000..dbf2280
--- /dev/null
+++ b/bin/get_frequent_words.py
@@ -0,0 +1,63 @@
+#!/usr/bin/env python3
+"""
+Reads through an ARPA language model and prints out the N most likely words
+(1-grams) based on their log probabilities.
+
+Example:
+$ python3 get_frequent_words.py 100 < ARPA.lm
+"""
+import argparse
+import heapq
+import sys
+
+# Ignore start/stop sentence tokens
+_IGNORE_WORDS = set(["<s>", "</s>"])
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("n", type=int, help="Number of words")
+    args = parser.parse_args()
+
+    print("Reading ARPA language model from stdin...", file=sys.stderr)
+
+    in_1grams = False
+    frequent_words = []
+
+    for line in sys.stdin:
+        line = line.strip()
+        if not line:
+            continue
+
+        if line.startswith("\\"):
+            if in_1grams:
+                # Must be past 1-grams now
+                break
+            elif line == "\\1-grams:":
+                in_1grams = True
+        elif in_1grams:
+            # Parse 1-gram: log probability, word, optional backoff weight
+            prob, word, *_ = line.split()
+            prob = float(prob)
+            word = word.strip()
+
+            if (not word) or (word in _IGNORE_WORDS):
+                # Skip empty or ignored words
+                continue
+
+            if len(frequent_words) < args.n:
+                # Append to heap
+                heapq.heappush(frequent_words, (prob, word))
+            else:
+                # Replace least likely element
+                heapq.heappushpop(frequent_words, (prob, word))
+
+    # Print the n most likely words, most likely first
+    for prob, word in sorted(frequent_words, reverse=True):
+        print(prob, word)
+
+
+# -----------------------------------------------------------------------------
+
+if __name__ == "__main__":
+    main()
diff --git a/bin/words2espeak.sh b/bin/words2espeak.sh
new file mode 100644
index 0000000..0237028
--- /dev/null
+++ b/bin/words2espeak.sh
@@ -0,0 +1,29 @@
+#!/usr/bin/env bash
+# Reads words from standard in and prints their eSpeak phonemes to standard out.
+
+espeak_args=()
+while [[ -n "$1" ]];
+do
+    if [[ "$1" == '--print-word' ]];
+    then
+        print_word='1'
+    else
+        espeak_args+=("$1")
+    fi
+
+    shift
+done
+
+# -----------------------------------------------------------------------------
+
+echo 'Reading words from stdin...' >&2
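+
+# Example usage (illustrative; any arguments other than --print-word are passed
+# straight to espeak-ng, e.g. "-v" to select a voice):
+#   echo 'hello' | bin/words2espeak.sh --print-word -v en-us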
+
+while read -r line || [[ -n "${line}" ]];
+do
+    if [[ -n "${print_word}" ]]; then
+        echo -n "${line} "
+    fi
+
+    phones="$(espeak-ng "${espeak_args[@]}" -q -x --sep=' ' "${line}" | sed -e 's/^[[:space:]]*//')"
+    echo "${phones}"
+done
diff --git a/bin/words2ipa.py b/bin/words2ipa.py
new file mode 100644
index 0000000..1ee829c
--- /dev/null
+++ b/bin/words2ipa.py
@@ -0,0 +1,110 @@
+#!/usr/bin/env python3
+"""
+Reads words from standard in and prints their IPA representation to standard out.
+
+Uses the epitran library.
+https://github.com/dmort27/epitran
+"""
+import argparse
+import sys
+
+from epitran import Epitran
+
+# aar-Latn Afar
+# amh-Ethi Amharic
+# ara-Arab Literary Arabic
+# aze-Cyrl Azerbaijani (Cyrillic)
+# aze-Latn Azerbaijani (Latin)
+# ben-Beng Bengali
+# ben-Beng-red Bengali (reduced)
+# cat-Latn Catalan
+# ceb-Latn Cebuano
+# cmn-Hans Mandarin (Simplified)*
+# cmn-Hant Mandarin (Traditional)*
+# ckb-Arab Sorani
+# deu-Latn German
+# deu-Latn-np German†
+# deu-Latn-nar German (more phonetic)
+# eng-Latn English‡
+# fas-Arab Farsi (Perso-Arabic)
+# fra-Latn French
+# fra-Latn-np French†
+# hau-Latn Hausa
+# hin-Deva Hindi
+# hun-Latn Hungarian
+# ilo-Latn Ilocano
+# ind-Latn Indonesian
+# ita-Latn Italian
+# jav-Latn Javanese
+# kaz-Cyrl Kazakh (Cyrillic)
+# kaz-Latn Kazakh (Latin)
+# kin-Latn Kinyarwanda
+# kir-Arab Kyrgyz (Perso-Arabic)
+# kir-Cyrl Kyrgyz (Cyrillic)
+# kir-Latn Kyrgyz (Latin)
+# kmr-Latn Kurmanji
+# lao-Laoo Lao
+# mar-Deva Marathi
+# mlt-Latn Maltese
+# mya-Mymr Burmese
+# msa-Latn Malay
+# nld-Latn Dutch
+# nya-Latn Chichewa
+# orm-Latn Oromo
+# pan-Guru Punjabi (Eastern)
+# pol-Latn Polish
+# por-Latn Portuguese
+# ron-Latn Romanian
+# rus-Cyrl Russian
+# sna-Latn Shona
+# som-Latn Somali
+# spa-Latn Spanish
+# swa-Latn Swahili
+# swe-Latn Swedish
+# tam-Taml Tamil
+# tel-Telu Telugu
+# tgk-Cyrl Tajik
+# tgl-Latn Tagalog
+# tha-Thai Thai
+# tir-Ethi Tigrinya
+# tpi-Latn Tok Pisin
+# tuk-Cyrl Turkmen (Cyrillic)
+# tuk-Latn Turkmen (Latin)
+# tur-Latn Turkish (Latin)
+# ukr-Cyrl Ukrainian
+# uig-Arab Uyghur (Perso-Arabic)
+# uzb-Cyrl Uzbek (Cyrillic)
+# uzb-Latn Uzbek (Latin)
+# vie-Latn Vietnamese
+# xho-Latn Xhosa
+# yor-Latn Yoruba
+# zul-Latn Zulu
+
+
+def main():
+    parser = argparse.ArgumentParser(prog="words2ipa.py")
+    parser.add_argument("language", help="epitran language code (e.g., eng-Latn)")
+    parser.add_argument(
+        "--print-word", action="store_true", help="Print word before IPA"
+    )
+    parser.add_argument("--sep", help="Separator between IPA symbols (default: none)")
+    args = parser.parse_args()
+
+    e = Epitran(args.language)
+
+    print("Reading words from stdin...", file=sys.stderr)
+    for word in sys.stdin:
+        word = word.strip()
+        if word:
+            if args.print_word:
+                print(word, end=" ")
+
+            ipa = e.trans_list(word)
+            if args.sep:
+                print(args.sep.join(ipa))
+            else:
+                print("".join(ipa))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/bin/words2phonemes.py b/bin/words2phonemes.py
new file mode 100644
index 0000000..6e14f99
--- /dev/null
+++ b/bin/words2phonemes.py
@@ -0,0 +1,82 @@
+#!/usr/bin/env python3
+"""
+Reads words from standard in and prints their phonetic pronunciations on standard out.
+
+Requires a pre-built phonetic dictionary.
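+
+Dictionary lines follow the CMU format, e.g. (illustrative entries; variants
+are numbered in parentheses):
+
+    HELLO HH AH0 L OW1
+    HELLO(1) HH EH0 L OW1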
+""" +import argparse +import sys +import re +from collections import defaultdict + + +def main(): + parser = argparse.ArgumentParser(prog="words2phonemes.py") + parser.add_argument("dictionary", help="Phonetic dictionary (CMU format)") + parser.add_argument( + "--print-word", action="store_true", help="Print word before phonemes" + ) + parser.add_argument( + "--sep", default=" ", help="Separator between phonemes (default: space)" + ) + parser.add_argument( + "--case", choices=["upper", "lower"], help="Case transformation (default: none)" + ) + args = parser.parse_args() + + transform = lambda s: s + if args.case == "upper": + transform = str.upper + elif args.case == "lower": + transform = str.lower + + pron_dict = read_dict(args.dictionary, transform=transform) + + print("Reading words from stdin...", file=sys.stderr) + for word in sys.stdin: + word = transform(word.strip()) + if word: + if args.print_word: + print(word, end=" ") + + prons = pron_dict.get(word) + if prons: + print(args.sep.join(prons[0])) + else: + print("") + + +def read_dict(dict_path, transform=None): + """Load a CMU pronunciation dictionary.""" + word_pronunciations = defaultdict(list) + + with open(dict_path, "r") as dict_file: + for line in dict_file: + line = line.strip() + if len(line) == 0: + continue + + # Use explicit whitespace (avoid 0xA0) + parts = re.split(r"[ \t]+", line) + word = parts[0] + + if "(" in word: + word = word[: word.index("(")] + + if transform: + word = transform(word) + + # Exclude meta words from Julius dictionaries + if parts[1].startswith("["): + parts = parts[1:] + + pronunciation = [p for p in parts[1:]] + word_pronunciations[word].append(pronunciation) + + return word_pronunciations + + +# ----------------------------------------------------------------------------- + +if __name__ == "__main__": + main()