Add phoneme alignment scripts
Michael Hansen committed Jun 11, 2020
1 parent cbbc6d6 commit 7fcef89
Showing 5 changed files with 441 additions and 0 deletions.
157 changes: 157 additions & 0 deletions bin/align_phonemes.py
@@ -0,0 +1,157 @@
#!/usr/bin/env python3
"""
Attempts to guess the correspondence between a speech system's phonemes and some
other phoneme set (e.g., IPA or eSpeak).
Requires 3 text files:
* phonemes - <word> <phoneme1> <phoneme2> ...
* other - <word> <phoneme1> <phoneme2> ...
* examples - <phoneme> <word> <phoneme1> <phoneme2> ...
Alignment is done in 2 passes:
1. Count all cases where speech/other phoneme lists are the same length for a
given word. Assign phonemes based on highest count first.
2. Go back and assume unaligned phonemes are composed of two "others" instead of
one. Assign remaining phonemes based on highest count first.
"""
import argparse
import typing
from collections import Counter
from dataclasses import dataclass, field


@dataclass
class Word:
    """Word with speech system and other phonemes."""

    word: str
    phonemes: typing.List[str] = field(default_factory=list)
    other: typing.List[str] = field(default_factory=list)


def clean(s: str) -> str:
    """Remove stress marks/accents from a phoneme."""
    return s.strip().replace("'", "")


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "phonemes", help="Path to file with words and speech system phonemes"
    )
    parser.add_argument("other", help="Path to file with words and other phonemes")
    parser.add_argument(
        "examples", help="Path to file with speech system phoneme examples"
    )
    parser.add_argument(
        "--missing", default="?", help="String to print for missing phonemes"
    )
    args = parser.parse_args()

    words: typing.Dict[str, Word] = {}

    # Load speech system phonemes
    with open(args.phonemes, "r") as phoneme_file:
        for line in phoneme_file:
            line = line.strip()
            if line:
                word_str, *phonemes = line.split()
                word = words.get(word_str)
                if not word:
                    word = Word(word_str)
                    words[word_str] = word

                word.phonemes = [clean(p) for p in phonemes]

    # Load other phonemes
    with open(args.other, "r") as other_file:
        for line in other_file:
            line = line.strip()
            if line:
                word_str, *others = line.split()
                word = words.get(word_str)
                if not word:
                    word = Word(word_str)
                    words[word_str] = word

                word.other = [clean(o) for o in others]

    # Load phoneme examples
    all_phonemes: typing.Set[str] = set()
    phoneme_example: typing.Dict[str, str] = {}

    with open(args.examples, "r") as examples_file:
        for line in examples_file:
            line = line.strip()
            if line:
                phoneme, example, *phonemes = line.split()
                all_phonemes.add(phoneme)
                phoneme_example[phoneme] = example

    # -------------------------------------------------------------------------

    # phoneme -> other
    assignments: typing.Dict[str, str] = {}

    # ------
    # Pass 1
    # ------
    # Find candidates with identical lengths.
    phoneme_other = Counter()
    for word in words.values():
        if len(word.phonemes) == len(word.other):
            for phoneme, other in zip(word.phonemes, word.other):
                phoneme_other[(phoneme, other)] += 1

    # Assign naively based purely on count
    for candidate, count in phoneme_other.most_common():
        phoneme, other = candidate
        if (phoneme not in assignments) and (other not in assignments.values()):
            assignments[phoneme] = other

    # ------
    # Pass 2
    # ------
    # Assume unassigned phonemes map to two "others".
    unassigned = all_phonemes - set(assignments)
    if unassigned:
        for word in words.values():
            if len(word.other) > len(word.phonemes):
                others = list(word.other)
                for phoneme in word.phonemes:
                    if not others:
                        # No more others left
                        break

                    if (phoneme in unassigned) and (len(others) >= 2):
                        # Grab two "others" for this unassigned phoneme and
                        # consume them
                        phoneme_other[(phoneme, "".join(others[:2]))] += 1
                        others = others[2:]
                    else:
                        # Skip over "other" for already assigned phoneme
                        others = others[1:]

        # Do assignment again with (hopefully) new candidates
        for candidate, count in phoneme_other.most_common():
            phoneme, other = candidate
            if (phoneme not in assignments) and (other not in assignments.values()):
                assignments[phoneme] = other

    # -------------------------------------------------------------------------

    # Print candidates and ? for phonemes with no candidates
    for phoneme in sorted(all_phonemes):
        print(
            phoneme,
            assignments.get(phoneme, args.missing),
            phoneme_example.get(phoneme, ""),
        )


# -----------------------------------------------------------------------------

if __name__ == "__main__":
    main()
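
A quick sanity check of the two-pass alignment with hypothetical inputs (all
words, phonemes, and file names below are made up for illustration):

phonemes.txt:

    hello HH AH L OW
    bed B EH D

other.txt:

    hello h @ l oU
    bed b E d

examples.txt:

    HH hello HH AH L OW
    EH bed B EH D

Running the script:

    $ python3 bin/align_phonemes.py phonemes.txt other.txt examples.txt
    EH E bed
    HH h hello

Both words have speech/other lists of equal length, so pass 1 aligns every pair
by count; pass 2 only comes into play for words whose "other" list is longer,
e.g., when a single speech phoneme corresponds to two IPA symbols.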
63 changes: 63 additions & 0 deletions bin/get_frequent_words.py
@@ -0,0 +1,63 @@
#!/usr/bin/env python3
"""
Reads through an ARPA language model and prints out the N most likely words
(1-grams) based on their log probabilities.
Example:
$ python3 get_frequent_words.py 100 < ARPA.lm
"""
import argparse
import heapq
import sys

# Ignore start/stop sentence tokens
_IGNORE_WORDS = {"<s>", "</s>"}


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("n", type=int, help="Number of words")
    args = parser.parse_args()

    print("Reading ARPA language model from stdin...", file=sys.stderr)

    in_1grams = False
    frequent_words = []

    for line in sys.stdin:
        line = line.strip()
        if not line:
            continue

        if line.startswith("\\"):
            if in_1grams:
                # Must be past 1-grams now
                break
            elif line == "\\1-grams:":
                in_1grams = True
        elif in_1grams:
            # Parse 1-gram: <log10 probability> <word> [<backoff weight>]
            prob, word, *rest = line.split()
            prob = float(prob)
            word = word.strip()

            if (not word) or (word in _IGNORE_WORDS):
                # Skip empty or ignored words
                continue

            if len(frequent_words) < args.n:
                # Grow the min-heap until it holds n words
                heapq.heappush(frequent_words, (prob, word))
            else:
                # Replace the least likely word seen so far
                heapq.heappushpop(frequent_words, (prob, word))

    # Print the n most likely words, most likely first
    for prob, word in sorted(frequent_words, reverse=True):
        print(prob, word)


# -----------------------------------------------------------------------------

if __name__ == "__main__":
    main()
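
The heap above is the standard top-N selection pattern: heapq keeps a min-heap
ordered by log probability, so the root is always the least likely word
retained so far, and heappushpop evicts it whenever a more likely word arrives.
A minimal sketch of the same pattern with made-up log probabilities (not from a
real model):

    import heapq

    top = []  # min-heap of (log_prob, word) tuples
    for item in [(-1.2, "the"), (-3.5, "xylophone"), (-2.0, "and")]:
        if len(top) < 2:  # keep the 2 most likely words
            heapq.heappush(top, item)
        else:
            heapq.heappushpop(top, item)  # push, then evict the least likely

    print(sorted(top, reverse=True))  # [(-1.2, 'the'), (-2.0, 'and')]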
29 changes: 29 additions & 0 deletions bin/words2espeak.sh
@@ -0,0 +1,29 @@
#!/usr/bin/env bash
# Reads words from standard input and prints their eSpeak phonemes to standard output.

# Collect arguments for espeak-ng, handling --print-word ourselves
espeak_args=()
while [[ -n "$1" ]];
do
    if [[ "$1" == '--print-word' ]];
    then
        print_word='1'
    else
        espeak_args+=("$1")
    fi

    shift
done

# -----------------------------------------------------------------------------

echo 'Reading words from stdin...' >&2

while read -r line || [[ -n "${line}" ]];
do
    if [[ -n "${print_word}" ]]; then
        echo -n "${line} "
    fi

    # -q: no audio, -x: print phoneme mnemonics; strip leading whitespace
    phones="$(espeak-ng "${espeak_args[@]}" -q -x --sep=' ' "${line}" | sed -e 's/^[[:space:]]*//')"
    echo "${phones}"
done
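
Assuming espeak-ng is installed and on the PATH, usage looks like the
following; any extra arguments (here a hypothetical voice selection) are passed
through to espeak-ng, and the exact phoneme mnemonics depend on the voice:

    $ echo 'hello' | bin/words2espeak.sh -v en-us --print-word
    hello h @ l 'oU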
110 changes: 110 additions & 0 deletions bin/words2ipa.py
@@ -0,0 +1,110 @@
#!/usr/bin/env python3
"""
Reads words from standard in and print their IPA representation to standard out.
Uses the epitran library.
https://github.com/dmort27/epitran
"""
import argparse
import sys

from epitran import Epitran

# aar-Latn Afar
# amh-Ethi Amharic
# ara-Arab Literary Arabic
# aze-Cyrl Azerbaijani (Cyrillic)
# aze-Latn Azerbaijani (Latin)
# ben-Beng Bengali
# ben-Beng-red Bengali (reduced)
# cat-Latn Catalan
# ceb-Latn Cebuano
# cmn-Hans Mandarin (Simplified)*
# cmn-Hant Mandarin (Traditional)*
# ckb-Arab Sorani
# deu-Latn German
# deu-Latn-np German†
# deu-Latn-nar German (more phonetic)
# eng-Latn English‡
# fas-Arab Farsi (Perso-Arabic)
# fra-Latn French
# fra-Latn-np French†
# hau-Latn Hausa
# hin-Deva Hindi
# hun-Latn Hungarian
# ilo-Latn Ilocano
# ind-Latn Indonesian
# ita-Latn Italian
# jav-Latn Javanese
# kaz-Cyrl Kazakh (Cyrillic)
# kaz-Latn Kazakh (Latin)
# kin-Latn Kinyarwanda
# kir-Arab Kyrgyz (Perso-Arabic)
# kir-Cyrl Kyrgyz (Cyrillic)
# kir-Latn Kyrgyz (Latin)
# kmr-Latn Kurmanji
# lao-Laoo Lao
# mar-Deva Marathi
# mlt-Latn Maltese
# mya-Mymr Burmese
# msa-Latn Malay
# nld-Latn Dutch
# nya-Latn Chichewa
# orm-Latn Oromo
# pan-Guru Punjabi (Eastern)
# pol-Latn Polish
# por-Latn Portuguese
# ron-Latn Romanian
# rus-Cyrl Russian
# sna-Latn Shona
# som-Latn Somali
# spa-Latn Spanish
# swa-Latn Swahili
# swe-Latn Swedish
# tam-Taml Tamil
# tel-Telu Telugu
# tgk-Cyrl Tajik
# tgl-Latn Tagalog
# tha-Thai Thai
# tir-Ethi Tigrinya
# tpi-Latn Tok Pisin
# tuk-Cyrl Turkmen (Cyrillic)
# tuk-Latn Turkmen (Latin)
# tur-Latn Turkish (Latin)
# ukr-Cyrl Ukrainian
# uig-Arab Uyghur (Perso-Arabic)
# uzb-Cyrl Uzbek (Cyrillic)
# uzb-Latn Uzbek (Latin)
# vie-Latn Vietnamese
# xho-Latn Xhosa
# yor-Latn Yoruba
# zul-Latn Zulu


def main():
    parser = argparse.ArgumentParser(prog="words2ipa.py")
    parser.add_argument("language", help="epitran language code (e.g., eng-Latn)")
    parser.add_argument(
        "--print-word", action="store_true", help="Print word before IPA"
    )
    parser.add_argument("--sep", help="Separator between IPA symbols (default: none)")
    args = parser.parse_args()

    e = Epitran(args.language)

    print("Reading words from stdin...", file=sys.stderr)
    for word in sys.stdin:
        word = word.strip()
        if word:
            if args.print_word:
                print(word, end=" ")

            ipa = e.trans_list(word)
            if args.sep:
                print(args.sep.join(ipa))
            else:
                print("".join(ipa))


if __name__ == "__main__":
    main()
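
The epitran surface used here is small: construct Epitran with a
language-script code from the table above, then call trans_list() to get a
list of IPA symbols for a word. A quick interactive sketch (output is
indicative and may vary by epitran version):

    from epitran import Epitran

    e = Epitran("spa-Latn")
    print(e.trans_list("gato"))           # e.g., ['g', 'a', 't', 'o']
    print("".join(e.trans_list("gato")))  # joined form, as the script prints it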