Add phoneme alignment scripts
Michael Hansen committed Jun 11, 2020
1 parent cbbc6d6 commit 7fcef89
Showing 5 changed files with 441 additions and 0 deletions.
157 changes: 157 additions & 0 deletions bin/align_phonemes.py
@@ -0,0 +1,157 @@
#!/usr/bin/env python3
"""
Attempts to guess the correspondence between a speech system's phonemes and some
other phoneme set (e.g., IPA or eSpeak).
Requires 3 text files:
* phonemes - <word> <phoneme1> <phoneme2> ...
* other - <word> <phoneme1> <phoneme2> ...
* examples - <phoneme> <word> <phoneme1> <phoneme2> ...
Alignment is done in 2 passes:
1. Count all cases where speech/other phoneme lists are the same length for a
given word. Assign phonemes based on highest count first.
2. Go back and assume unaligned phonemes are composed of two "others" instead of
one. Assign remaining phonemes based on highest count first.
"""
import argparse
import typing
from collections import Counter
from dataclasses import dataclass, field


@dataclass
class Word:
    """Word with speech system and other phonemes."""

    word: str
    phonemes: typing.List[str] = field(default_factory=list)
    other: typing.List[str] = field(default_factory=list)


def clean(s: str) -> str:
    """Remove stress marks/accents from a phoneme."""
    return s.strip().replace("'", "")


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "phonemes", help="Path to file with words and speech system phonemes"
    )
    parser.add_argument("other", help="Path to file with words and other phonemes")
    parser.add_argument(
        "examples", help="Path to file with speech system phoneme examples"
    )
    parser.add_argument(
        "--missing", default="?", help="String to print for missing phonemes"
    )
    args = parser.parse_args()

    words: typing.Dict[str, Word] = {}

    # Load speech system phonemes
    with open(args.phonemes, "r") as phoneme_file:
        for line in phoneme_file:
            line = line.strip()
            if line:
                word_str, *phonemes = line.split()
                word = words.get(word_str)
                if not word:
                    word = Word(word_str)
                    words[word_str] = word

                word.phonemes = [clean(p) for p in phonemes]

    # Load other phonemes
    with open(args.other, "r") as other_file:
        for line in other_file:
            line = line.strip()
            if line:
                word_str, *others = line.split()
                word = words.get(word_str)
                if not word:
                    word = Word(word_str)
                    words[word_str] = word

                word.other = [clean(o) for o in others]

    # Load phoneme examples
    all_phonemes: typing.Set[str] = set()
    phoneme_example: typing.Dict[str, str] = {}

    with open(args.examples, "r") as examples_file:
        for line in examples_file:
            line = line.strip()
            if line:
                phoneme, example, *phonemes = line.split()
                all_phonemes.add(phoneme)
                phoneme_example[phoneme] = example

    # -------------------------------------------------------------------------

    # phoneme -> other
    assignments: typing.Dict[str, str] = {}

    # ------
    # Pass 1
    # ------
    # Find candidates with identical lengths.
    phoneme_other = Counter()
    for word in words.values():
        if len(word.phonemes) == len(word.other):
            for phoneme, other in zip(word.phonemes, word.other):
                phoneme_other[(phoneme, other)] += 1

    # Assign naively based purely on count
    for candidate, count in phoneme_other.most_common():
        phoneme, other = candidate
        if (phoneme not in assignments) and (other not in assignments.values()):
            assignments[phoneme] = other

    # ------
    # Pass 2
    # ------
    # Assume unassigned phonemes map to two "others".
    unassigned = all_phonemes - set(assignments)
    if unassigned:
        for word in words.values():
            if len(word.other) > len(word.phonemes):
                others = list(word.other)
                for phoneme in word.phonemes:
                    if not others:
                        # No more others left
                        break

                    if (phoneme in unassigned) and (len(others) >= 2):
                        # Grab two "others" for this unassigned phoneme and
                        # consume them
                        phoneme_other[(phoneme, "".join(others[:2]))] += 1
                        others = others[2:]
                    else:
                        # Skip over "other" for already assigned phoneme
                        others = others[1:]

        # Do assignment again with (hopefully) new candidates
        for candidate, count in phoneme_other.most_common():
            phoneme, other = candidate
            if (phoneme not in assignments) and (other not in assignments.values()):
                assignments[phoneme] = other

    # -------------------------------------------------------------------------

    # Print candidates and ? for phonemes with no candidates
    for phoneme in sorted(all_phonemes):
        print(
            phoneme,
            assignments.get(phoneme, args.missing),
            phoneme_example.get(phoneme, ""),
        )


# -----------------------------------------------------------------------------

if __name__ == "__main__":
    main()
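
A quick sanity check of the two-pass alignment with hypothetical inputs (all
words, phonemes, and file names below are made up for illustration):

phonemes.txt:

    hello HH AH L OW
    bed B EH D

other.txt:

    hello h @ l oU
    bed b E d

examples.txt:

    HH hello HH AH L OW
    EH bed B EH D

Running the script:

    $ python3 bin/align_phonemes.py phonemes.txt other.txt examples.txt
    EH E bed
    HH h hello

Both words have speech/other lists of equal length, so pass 1 aligns every pair
by count; pass 2 only comes into play for words whose "other" list is longer,
e.g., when a single speech phoneme corresponds to two IPA symbols.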
63 changes: 63 additions & 0 deletions bin/get_frequent_words.py
@@ -0,0 +1,63 @@
#!/usr/bin/env python3
"""
Reads through an ARPA language model and prints out the N most likely words
(1-grams) based on their log probabilities.
Example:
$ python3 get_frequent_words.py 100 < ARPA.lm
"""
import argparse
import heapq
import sys

# Ignore start/stop sentence tokens
_IGNORE_WORDS = {"<s>", "</s>"}


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("n", type=int, help="Number of words")
    args = parser.parse_args()

    print("Reading ARPA language model from stdin...", file=sys.stderr)

    in_1grams = False
    frequent_words = []

    for line in sys.stdin:
        line = line.strip()
        if not line:
            continue

        if line.startswith("\\"):
            if in_1grams:
                # Must be past 1-grams now
                break
            elif line == "\\1-grams:":
                in_1grams = True
        elif in_1grams:
            # Parse 1-gram: <log10 probability> <word> [<backoff weight>]
            prob, word, *rest = line.split()
            prob = float(prob)
            word = word.strip()

            if (not word) or (word in _IGNORE_WORDS):
                # Skip empty or ignored words
                continue

            if len(frequent_words) < args.n:
                # Grow the min-heap until it holds n words
                heapq.heappush(frequent_words, (prob, word))
            else:
                # Replace the least likely word seen so far
                heapq.heappushpop(frequent_words, (prob, word))

    # Print the n most likely words, most likely first
    for prob, word in sorted(frequent_words, reverse=True):
        print(prob, word)


# -----------------------------------------------------------------------------

if __name__ == "__main__":
    main()
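
The heap above is the standard top-N selection pattern: heapq keeps a min-heap
ordered by log probability, so the root is always the least likely word
retained so far, and heappushpop evicts it whenever a more likely word arrives.
A minimal sketch of the same pattern with made-up log probabilities (not from a
real model):

    import heapq

    top = []  # min-heap of (log_prob, word) tuples
    for item in [(-1.2, "the"), (-3.5, "xylophone"), (-2.0, "and")]:
        if len(top) < 2:  # keep the 2 most likely words
            heapq.heappush(top, item)
        else:
            heapq.heappushpop(top, item)  # push, then evict the least likely

    print(sorted(top, reverse=True))  # [(-1.2, 'the'), (-2.0, 'and')]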
29 changes: 29 additions & 0 deletions bin/words2espeak.sh
@@ -0,0 +1,29 @@
#!/usr/bin/env bash
# Reads words from standard input and prints their eSpeak phonemes to standard output.

# Collect arguments for espeak-ng, handling --print-word ourselves
espeak_args=()
while [[ -n "$1" ]];
do
    if [[ "$1" == '--print-word' ]];
    then
        print_word='1'
    else
        espeak_args+=("$1")
    fi

    shift
done

# -----------------------------------------------------------------------------

echo 'Reading words from stdin...' >&2

while read -r line || [[ -n "${line}" ]];
do
    if [[ -n "${print_word}" ]]; then
        echo -n "${line} "
    fi

    # -q: no audio, -x: print phoneme mnemonics; strip leading whitespace
    phones="$(espeak-ng "${espeak_args[@]}" -q -x --sep=' ' "${line}" | sed -e 's/^[[:space:]]*//')"
    echo "${phones}"
done
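
Assuming espeak-ng is installed and on the PATH, usage looks like the
following; any extra arguments (here a hypothetical voice selection) are passed
through to espeak-ng, and the exact phoneme mnemonics depend on the voice:

    $ echo 'hello' | bin/words2espeak.sh -v en-us --print-word
    hello h @ l 'oU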
110 changes: 110 additions & 0 deletions bin/words2ipa.py
@@ -0,0 +1,110 @@
#!/usr/bin/env python3
"""
Reads words from standard in and print their IPA representation to standard out.
Uses the epitran library.
https://github.com/dmort27/epitran
"""
import argparse
import sys

from epitran import Epitran

# aar-Latn Afar
# amh-Ethi Amharic
# ara-Arab Literary Arabic
# aze-Cyrl Azerbaijani (Cyrillic)
# aze-Latn Azerbaijani (Latin)
# ben-Beng Bengali
# ben-Beng-red Bengali (reduced)
# cat-Latn Catalan
# ceb-Latn Cebuano
# cmn-Hans Mandarin (Simplified)*
# cmn-Hant Mandarin (Traditional)*
# ckb-Arab Sorani
# deu-Latn German
# deu-Latn-np German†
# deu-Latn-nar German (more phonetic)
# eng-Latn English‡
# fas-Arab Farsi (Perso-Arabic)
# fra-Latn French
# fra-Latn-np French†
# hau-Latn Hausa
# hin-Deva Hindi
# hun-Latn Hungarian
# ilo-Latn Ilocano
# ind-Latn Indonesian
# ita-Latn Italian
# jav-Latn Javanese
# kaz-Cyrl Kazakh (Cyrillic)
# kaz-Latn Kazakh (Latin)
# kin-Latn Kinyarwanda
# kir-Arab Kyrgyz (Perso-Arabic)
# kir-Cyrl Kyrgyz (Cyrillic)
# kir-Latn Kyrgyz (Latin)
# kmr-Latn Kurmanji
# lao-Laoo Lao
# mar-Deva Marathi
# mlt-Latn Maltese
# mya-Mymr Burmese
# msa-Latn Malay
# nld-Latn Dutch
# nya-Latn Chichewa
# orm-Latn Oromo
# pan-Guru Punjabi (Eastern)
# pol-Latn Polish
# por-Latn Portuguese
# ron-Latn Romanian
# rus-Cyrl Russian
# sna-Latn Shona
# som-Latn Somali
# spa-Latn Spanish
# swa-Latn Swahili
# swe-Latn Swedish
# tam-Taml Tamil
# tel-Telu Telugu
# tgk-Cyrl Tajik
# tgl-Latn Tagalog
# tha-Thai Thai
# tir-Ethi Tigrinya
# tpi-Latn Tok Pisin
# tuk-Cyrl Turkmen (Cyrillic)
# tuk-Latn Turkmen (Latin)
# tur-Latn Turkish (Latin)
# ukr-Cyrl Ukrainian
# uig-Arab Uyghur (Perso-Arabic)
# uzb-Cyrl Uzbek (Cyrillic)
# uzb-Latn Uzbek (Latin)
# vie-Latn Vietnamese
# xho-Latn Xhosa
# yor-Latn Yoruba
# zul-Latn Zulu


def main():
    parser = argparse.ArgumentParser(prog="words2ipa.py")
    parser.add_argument("language", help="epitran language code (e.g., eng-Latn)")
    parser.add_argument(
        "--print-word", action="store_true", help="Print word before IPA"
    )
    parser.add_argument("--sep", help="Separator between IPA symbols (default: none)")
    args = parser.parse_args()

    e = Epitran(args.language)

    print("Reading words from stdin...", file=sys.stderr)
    for word in sys.stdin:
        word = word.strip()
        if word:
            if args.print_word:
                print(word, end=" ")

            ipa = e.trans_list(word)
            if args.sep:
                print(args.sep.join(ipa))
            else:
                print("".join(ipa))


if __name__ == "__main__":
    main()
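
The epitran surface used here is small: construct Epitran with a
language-script code from the table above, then call trans_list() to get a
list of IPA symbols for a word. A quick interactive sketch (output is
indicative and may vary by epitran version):

    from epitran import Epitran

    e = Epitran("spa-Latn")
    print(e.trans_list("gato"))           # e.g., ['g', 'a', 't', 'o']
    print("".join(e.trans_list("gato")))  # joined form, as the script prints it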