Skip to content

Commit

Permalink
Update scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
Michael Hansen committed Jul 15, 2020
1 parent 7fcef89 commit 3bc1560
Show file tree
Hide file tree
Showing 11 changed files with 683 additions and 21 deletions.
8 changes: 7 additions & 1 deletion __main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,13 @@
async def download_raw(path: str) -> Response:
components = path.split("/")
profile = components[0]
artifact = "/".join(components[3:])

if components[1] == "raw":
# /raw/master/{file}
artifact = "/".join(components[3:])
else:
# /{commit}/{file}
artifact = "/".join(components[2:])

profile_dir = profile_dirs.get(profile)
assert profile_dir, f"Missing directory for {profile}"
Expand Down
42 changes: 26 additions & 16 deletions bin/align_phonemes.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,9 @@
Attempts to guess the correspondence between a speech system's phonemes and some
other phoneme set (e.g., IPA or eSpeak).
Requires 3 text files:
Requires 2 text files:
* phonemes - <word> <phoneme1> <phoneme2> ...
* other - <word> <phoneme1> <phoneme2> ...
* examples - <phoneme> <word> <phoneme1> <phoneme2> ...
Alignment is done in 2 passes:
Expand Down Expand Up @@ -44,13 +43,14 @@ def main():
)
parser.add_argument("other", help="Path to file with words and other phonemes")
parser.add_argument(
"examples", help="Path to file with speech system phoneme examples"
"--examples", help="Path to file with speech system phoneme examples"
)
parser.add_argument(
"--missing", default="?", help="String to print for missing phonemes"
)
args = parser.parse_args()

all_phonemes: typing.Set[str] = set()
words: typing.Dict[str, Word] = {}

# Load speech system phonemes
Expand All @@ -59,19 +59,26 @@ def main():
line = line.strip()
if line:
word_str, *phonemes = line.split()
if not phonemes:
continue

word = words.get(word_str)
if not word:
word = Word(word_str)
words[word_str] = word

word.phonemes = [clean(p) for p in phonemes]
all_phonemes.update(word.phonemes)

# Load other phonemes
with open(args.other, "r") as other_file:
for line in other_file:
line = line.strip()
if line:
word_str, *others = line.split()
if not others:
continue

word = words.get(word_str)
if not word:
word = Word(word_str)
Expand All @@ -80,16 +87,16 @@ def main():
word.other = [clean(o) for o in others]

# Load phoneme examples
all_phonemes: typing.Set[str] = set()
phoneme_example: typing.Dict[str, str] = {}

with open(args.examples, "r") as examples_file:
for line in examples_file:
line = line.strip()
if line:
phoneme, example, *phonemes = line.split()
all_phonemes.add(phoneme)
phoneme_example[phoneme] = example
if args.examples:
with open(args.examples, "r") as examples_file:
for line in examples_file:
line = line.strip()
if line:
phoneme, example, *phonemes = line.split()
all_phonemes.add(phoneme)
phoneme_example[phoneme] = example

# -------------------------------------------------------------------------

Expand Down Expand Up @@ -118,6 +125,7 @@ def main():
# Assume unassigned phonemes map to two "others".
assigned_others = set(assignments.values())
unassigned = all_phonemes - set(assignments)

if unassigned:
for word in words.values():
if len(word.other) > len(word.phonemes):
Expand All @@ -144,11 +152,13 @@ def main():

# Print candidates and ? for phonemes with no candidates
for phoneme in sorted(all_phonemes):
print(
phoneme,
assignments.get(phoneme, args.missing),
phoneme_example.get(phoneme, ""),
)
print(phoneme, assignments.get(phoneme, args.missing), end="")

if args.examples:
print(" ", phoneme_example.get(phoneme, ""))
else:
# End line
print("")


# -----------------------------------------------------------------------------
Expand Down
5 changes: 5 additions & 0 deletions bin/check_profiles.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,10 @@

from utils import load_profile


def main():
print("Reading YAML paths from stdin...", file=sys.stderr)

for yml_path_str in sys.stdin:
yml_path = Path(yml_path_str.strip())
profile_dir = yml_path.parent
Expand Down Expand Up @@ -69,6 +72,7 @@ def main():

# -----------------------------------------------------------------------------


def path_exists(path):
if not path.exists():
# Try .gz
Expand All @@ -81,6 +85,7 @@ def path_exists(path):

return True


# -----------------------------------------------------------------------------

if __name__ == "__main__":
Expand Down
7 changes: 7 additions & 0 deletions bin/get_frequent_words.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,14 @@
"""
import argparse
import heapq
import re
import sys

# Ignore start/stop sentence tokens
_IGNORE_WORDS = set(["<s>", "</s>"])

_IGNORE_REGEX = re.compile(r"^[0-9,.%-]+$")


def main():
parser = argparse.ArgumentParser()
Expand Down Expand Up @@ -45,6 +48,10 @@ def main():
# Skip empty or ignored words
continue

# Skip numbers, etc.
if _IGNORE_REGEX.match(word):
continue

if len(frequent_words) < args.n:
# Append to heap
heapq.heappush(frequent_words, (prob, word))
Expand Down
47 changes: 47 additions & 0 deletions bin/make_g2p_align.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#!/usr/bin/env bash
# Abort on the first failing command, and make a pipeline fail when ANY stage
# fails (without pipefail a failing perl stage is hidden by sed's exit status).
set -e
set -o pipefail

# Creates a grapheme-to-phoneme alignment corpus from a pronunciation dictionary
# using phonetisaurus.
#
# Usage: make_g2p_align.sh DICTIONARY_IN CORPUS_OUT
#   DICTIONARY_IN  path to the pronunciation dictionary, or '-' to read stdin
#   CORPUS_OUT     path where the generated alignment corpus is written

if [[ -z "$2" ]]; then
    echo "Usage: make_g2p_align.sh DICTIONARY_IN CORPUS_OUT" >&2
    exit 1
fi

# Directory containing this script; phonetisaurus-alignment is expected here.
this_dir="$( cd "$( dirname "$0" )" && pwd )"

# Resolve to an absolute path now, because the script changes directory below.
corpus_path="$(realpath "$2")"

# -----------------------------------------------------------------------------

# Scratch directory for intermediate files; removed on exit (success or error).
temp_dir="$(mktemp -d)"
function finish {
    rm -rf "${temp_dir}"
}

trap finish EXIT

# -----------------------------------------------------------------------------

if [[ "$1" == '-' ]]; then
    # Read from stdin into temporary file
    dict_path="${temp_dir}/unformatted.dict"
    cat > "${dict_path}"
else
    dict_path="$(realpath "$1")"
fi

# Format dictionary for phonetisaurus:
#   * strip "(N)" alternate-pronunciation markers
#   * collapse runs of spaces and trim leading/trailing spaces
#   * emit "<word>TAB<phoneme> <phoneme> ..."
# sed then drops any line containing '_', '|' or a \xA0 byte.
# NOTE(review): perl -p does not chomp, so the last phoneme appears to keep its
# newline and each entry may be followed by an empty line -- confirm this is
# acceptable to phonetisaurus-alignment.
cd "${temp_dir}"
perl -pe 's/\([0-9]+\)//;
          s/[ ]+/ /g; s/^[ ]+//;
          s/[ ]+$//; @_ = split (/[ ]+/);
          $w = shift (@_);
          $_ = $w."\t".join (" ", @_)."\n";' < "${dict_path}" | sed -e '/[_|\xA0]/d' > formatted.dict

# Generate g2p corpus
"${this_dir}/phonetisaurus-alignment" --lexicon formatted.dict --seq2_del --verbose

# Copy out of temporary directory
cp train/model.corpus "${corpus_path}"
Loading

0 comments on commit 3bc1560

Please sign in to comment.