Commit 7fcef89 by Michael Hansen, committed Jun 11, 2020 (1 parent: cbbc6d6)
Showing 5 changed files with 441 additions and 0 deletions.
@@ -0,0 +1,157 @@
#!/usr/bin/env python3
"""
Attempts to guess the correspondence between a speech system's phonemes and some
other phoneme set (e.g., IPA or eSpeak).

Requires 3 text files:

* phonemes - <word> <phoneme1> <phoneme2> ...
* other - <word> <phoneme1> <phoneme2> ...
* examples - <phoneme> <word> <phoneme1> <phoneme2> ...

Alignment is done in 2 passes:

1. Count all cases where speech/other phoneme lists are the same length for a
   given word. Assign phonemes based on highest count first.
2. Go back and assume unaligned phonemes are composed of two "others" instead of
   one. Assign remaining phonemes based on highest count first.
"""
import argparse
import re
import typing
from collections import Counter
from dataclasses import dataclass, field


@dataclass
class Word:
    """Word with speech system and other phonemes."""

    word: str
    phonemes: typing.List[str] = field(default_factory=list)
    other: typing.List[str] = field(default_factory=list)


def clean(s: str) -> str:
    """Remove accents (stress apostrophes)."""
    return s.strip().replace("'", "")


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "phonemes", help="Path to file with words and speech system phonemes"
    )
    parser.add_argument("other", help="Path to file with words and other phonemes")
    parser.add_argument(
        "examples", help="Path to file with speech system phoneme examples"
    )
    parser.add_argument(
        "--missing", default="?", help="String to print for missing phonemes"
    )
    args = parser.parse_args()

    words: typing.Dict[str, Word] = {}

    # Load speech system phonemes
    with open(args.phonemes, "r") as phoneme_file:
        for line in phoneme_file:
            line = line.strip()
            if line:
                word_str, *phonemes = line.split()
                word = words.get(word_str)
                if not word:
                    word = Word(word_str)
                    words[word_str] = word

                word.phonemes = [clean(p) for p in phonemes]

    # Load other phonemes
    with open(args.other, "r") as other_file:
        for line in other_file:
            line = line.strip()
            if line:
                word_str, *others = line.split()
                word = words.get(word_str)
                if not word:
                    word = Word(word_str)
                    words[word_str] = word

                word.other = [clean(o) for o in others]

    # Load phoneme examples
    all_phonemes: typing.Set[str] = set()
    phoneme_example: typing.Dict[str, str] = {}

    with open(args.examples, "r") as examples_file:
        for line in examples_file:
            line = line.strip()
            if line:
                phoneme, example, *phonemes = line.split()
                all_phonemes.add(phoneme)
                phoneme_example[phoneme] = example

    # -------------------------------------------------------------------------

    # phoneme -> other
    assignments: typing.Dict[str, str] = {}

    # ------
    # Pass 1
    # ------
    # Find candidates with identical lengths.
    phoneme_other = Counter()
    for word in words.values():
        if len(word.phonemes) == len(word.other):
            for phoneme, other in zip(word.phonemes, word.other):
                phoneme_other[(phoneme, other)] += 1

    # Assign naively based purely on count
    for candidate, count in phoneme_other.most_common():
        phoneme, other = candidate
        if (phoneme not in assignments) and (other not in assignments.values()):
            assignments[phoneme] = other

    # ------
    # Pass 2
    # ------
    # Assume unassigned phonemes map to two "others".
    assigned_others = set(assignments.values())
    unassigned = all_phonemes - set(assignments)
    if unassigned:
        for word in words.values():
            if len(word.other) > len(word.phonemes):
                others = list(word.other)
                for phoneme in word.phonemes:
                    if not others:
                        # No more others left
                        break

                    if (phoneme in unassigned) and (len(others) >= 2):
                        # Grab two "others" for this unassigned phoneme
                        phoneme_other[(phoneme, "".join(others[:2]))] += 1

                        # Consume both "others" so later phonemes stay aligned
                        others = others[2:]
                    else:
                        # Skip over "other" for already assigned phoneme
                        others = others[1:]

        # Do assignment again with (hopefully) new candidates
        for candidate, count in phoneme_other.most_common():
            phoneme, other = candidate
            if (phoneme not in assignments) and (other not in assignments.values()):
                assignments[phoneme] = other

    # -------------------------------------------------------------------------

    # Print candidates, and the --missing string for phonemes with no candidates
    for phoneme in sorted(all_phonemes):
        print(
            phoneme,
            assignments.get(phoneme, args.missing),
            phoneme_example.get(phoneme, ""),
        )


# -----------------------------------------------------------------------------

if __name__ == "__main__":
    main()
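To sanity-check the aligner, a hypothetical session (the script and file names are invented here, since the diff view does not show them, and the ARPAbet/IPA strings are illustrative; the three input formats follow the docstring above). With equal-length phoneme lists, pass 1 assigns everything:

$ cat phonemes.txt
hello HH AH L OW
$ cat other.txt
hello h ə l oʊ
$ cat examples.txt
HH hello HH AH L OW
AH hello HH AH L OW
L hello HH AH L OW
OW hello HH AH L OW
$ python3 guess_phonemes.py phonemes.txt other.txt examples.txt
AH ə hello
HH h hello
L l hello
OW oʊ hello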
@@ -0,0 +1,63 @@
#!/usr/bin/env python3
"""
Reads through an ARPA language model and prints out the N most likely words
(1-grams) based on their log probabilities.

Example:
    $ python3 get_frequent_words.py 100 < ARPA.lm
"""
import argparse
import heapq
import sys

# Ignore start/stop sentence tokens
_IGNORE_WORDS = set(["<s>", "</s>"])


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("n", type=int, help="Number of words")
    args = parser.parse_args()

    print("Reading ARPA language model from stdin...", file=sys.stderr)

    in_1grams = False
    frequent_words = []

    for line in sys.stdin:
        line = line.strip()
        if not line:
            continue

        if line.startswith("\\"):
            if in_1grams:
                # Must be past 1-grams now
                break
            elif line == "\\1-grams:":
                in_1grams = True
        elif in_1grams:
            # Parse 1-gram: <log10 prob> <word> [<backoff>]
            prob, word, *rest = line.split()
            prob = float(prob)
            word = word.strip()

            if (not word) or (word in _IGNORE_WORDS):
                # Skip empty or ignored words
                continue

            if len(frequent_words) < args.n:
                # Append to heap
                heapq.heappush(frequent_words, (prob, word))
            else:
                # Replace least likely element
                heapq.heappushpop(frequent_words, (prob, word))

    # Print the N most likely words (heap order, not sorted)
    for prob, word in frequent_words:
        print(prob, word)


# -----------------------------------------------------------------------------

if __name__ == "__main__":
    main()
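A hedged usage sketch (the model below is a made-up, minimal ARPA fragment; real models have far more entries plus 2-gram/3-gram sections). The min-heap keeps the N entries with the highest log probability, so output is in heap order rather than sorted:

$ cat tiny.lm
\data\
ngram 1=4

\1-grams:
-1.0 <s>
-1.2 the -0.5
-2.3 cat -0.4
-1.0 </s>

\end\
$ python3 get_frequent_words.py 2 < tiny.lm
Reading ARPA language model from stdin...
-2.3 cat
-1.2 the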
@@ -0,0 +1,29 @@
#!/usr/bin/env bash
# Reads words from standard input and prints their eSpeak phonemes to standard output.

print_word=''
espeak_args=()
while [[ -n "$1" ]]; do
    if [[ "$1" == '--print-word' ]]; then
        print_word='1'
    else
        espeak_args+=("$1")
    fi

    shift
done

# -----------------------------------------------------------------------------

echo 'Reading words from stdin...' >&2

while read -r line || [[ -n "${line}" ]]; do
    if [[ -n "${print_word}" ]]; then
        echo -n "${line} "
    fi

    # -q: no audio output, -x: write phoneme mnemonics to stdout
    phones="$(espeak-ng "${espeak_args[@]}" -q -x --sep=' ' "${line}" | sed -e 's/^[[:space:]]*//')"
    echo "${phones}"
done
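A hedged usage sketch (the script name is invented since the diff hides file names; -v en-us selects an espeak-ng voice, and the exact phoneme mnemonics vary by espeak-ng version, so the output line is illustrative):

$ echo 'hello' | ./words2phonemes.sh --print-word -v en-us
hello h @ l 'oU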
@@ -0,0 +1,110 @@
#!/usr/bin/env python3
"""
Reads words from standard input and prints their IPA representation to standard
output. Uses the epitran library.

https://github.com/dmort27/epitran
"""
import argparse
import sys

from epitran import Epitran

# aar-Latn Afar
# amh-Ethi Amharic
# ara-Arab Literary Arabic
# aze-Cyrl Azerbaijani (Cyrillic)
# aze-Latn Azerbaijani (Latin)
# ben-Beng Bengali
# ben-Beng-red Bengali (reduced)
# cat-Latn Catalan
# ceb-Latn Cebuano
# cmn-Hans Mandarin (Simplified)*
# cmn-Hant Mandarin (Traditional)*
# ckb-Arab Sorani
# deu-Latn German
# deu-Latn-np German†
# deu-Latn-nar German (more phonetic)
# eng-Latn English‡
# fas-Arab Farsi (Perso-Arabic)
# fra-Latn French
# fra-Latn-np French†
# hau-Latn Hausa
# hin-Deva Hindi
# hun-Latn Hungarian
# ilo-Latn Ilocano
# ind-Latn Indonesian
# ita-Latn Italian
# jav-Latn Javanese
# kaz-Cyrl Kazakh (Cyrillic)
# kaz-Latn Kazakh (Latin)
# kin-Latn Kinyarwanda
# kir-Arab Kyrgyz (Perso-Arabic)
# kir-Cyrl Kyrgyz (Cyrillic)
# kir-Latn Kyrgyz (Latin)
# kmr-Latn Kurmanji
# lao-Laoo Lao
# mar-Deva Marathi
# mlt-Latn Maltese
# mya-Mymr Burmese
# msa-Latn Malay
# nld-Latn Dutch
# nya-Latn Chichewa
# orm-Latn Oromo
# pan-Guru Punjabi (Eastern)
# pol-Latn Polish
# por-Latn Portuguese
# ron-Latn Romanian
# rus-Cyrl Russian
# sna-Latn Shona
# som-Latn Somali
# spa-Latn Spanish
# swa-Latn Swahili
# swe-Latn Swedish
# tam-Taml Tamil
# tel-Telu Telugu
# tgk-Cyrl Tajik
# tgl-Latn Tagalog
# tha-Thai Thai
# tir-Ethi Tigrinya
# tpi-Latn Tok Pisin
# tuk-Cyrl Turkmen (Cyrillic)
# tuk-Latn Turkmen (Latin)
# tur-Latn Turkish (Latin)
# ukr-Cyrl Ukrainian
# uig-Arab Uyghur (Perso-Arabic)
# uzb-Cyrl Uzbek (Cyrillic)
# uzb-Latn Uzbek (Latin)
# vie-Latn Vietnamese
# xho-Latn Xhosa
# yor-Latn Yoruba
# zul-Latn Zulu


def main():
    parser = argparse.ArgumentParser(prog="words2ipa.py")
    parser.add_argument("language", help="epitran language code (e.g., eng-Latn)")
    parser.add_argument(
        "--print-word", action="store_true", help="Print word before IPA"
    )
    parser.add_argument("--sep", help="Separator between IPA symbols (default: none)")
    args = parser.parse_args()

    e = Epitran(args.language)

    print("Reading words from stdin...", file=sys.stderr)
    for word in sys.stdin:
        word = word.strip()
        if word:
            if args.print_word:
                print(word, end=" ")

            ipa = e.trans_list(word)
            if args.sep:
                print(args.sep.join(ipa))
            else:
                print("".join(ipa))


if __name__ == "__main__":
    main()
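A hedged usage sketch (assumes epitran is installed, e.g. pip install epitran; the eng-Latn mode additionally depends on the lex_lookup tool from CMU Flite, and the exact IPA segments shown are illustrative):

$ echo 'hello' | python3 words2ipa.py eng-Latn --print-word --sep ' '
hello h ə l oʊ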