Update scripts

synesthesiam · Jul 15, 2020 · 3bc1560 · 3bc1560
1 parent 7fcef89
commit 3bc1560
Show file tree

Hide file tree

Showing 11 changed files with 683 additions and 21 deletions.
diff --git a/__main__.py b/__main__.py
@@ -30,7 +30,13 @@
 async def download_raw(path: str) -> Response:
     components = path.split("/")
     profile = components[0]
-    artifact = "/".join(components[3:])
+
+    if components[1] == "raw":
+        # /raw/master/{file}
+        artifact = "/".join(components[3:])
+    else:
+        # /{commit}/{file}
+        artifact = "/".join(components[2:])
 
     profile_dir = profile_dirs.get(profile)
     assert profile_dir, f"Missing directory for {profile}"

diff --git a/bin/align_phonemes.py b/bin/align_phonemes.py
@@ -3,10 +3,9 @@
 Attempts to guess the correspondence between a speech system's phonemes and some
 other phoneme set (e.g., IPA or eSpeak).
 
-Requires 3 text files:
+Requires 2 text files:
 * phonemes - <word> <phoneme1> <phoneme2> ...
 * other - <word> <phoneme1> <phoneme2> ...
-* examples - <phoneme> <word> <phoneme1> <phoneme2> ...
 
 Alignment is done in 2 passes:
 
@@ -44,13 +43,14 @@ def main():
     )
     parser.add_argument("other", help="Path to file with words and other phonemes")
     parser.add_argument(
-        "examples", help="Path to file with speech system phoneme examples"
+        "--examples", help="Path to file with speech system phoneme examples"
     )
     parser.add_argument(
         "--missing", default="?", help="String to print for missing phonemes"
     )
     args = parser.parse_args()
 
+    all_phonemes: typing.Set[str] = set()
     words: typing.Dict[str, Word] = {}
 
     # Load speech system phonemes
@@ -59,19 +59,26 @@ def main():
             line = line.strip()
             if line:
                 word_str, *phonemes = line.split()
+                if not phonemes:
+                    continue
+
                 word = words.get(word_str)
                 if not word:
                     word = Word(word_str)
                     words[word_str] = word
 
                 word.phonemes = [clean(p) for p in phonemes]
+                all_phonemes.update(word.phonemes)
 
     # Load other phonemes
     with open(args.other, "r") as other_file:
         for line in other_file:
             line = line.strip()
             if line:
                 word_str, *others = line.split()
+                if not others:
+                    continue
+
                 word = words.get(word_str)
                 if not word:
                     word = Word(word_str)
@@ -80,16 +87,16 @@ def main():
                 word.other = [clean(o) for o in others]
 
     # Load phoneme examples
-    all_phonemes: typing.Set[str] = set()
     phoneme_example: typing.Dict[str, str] = {}
 
-    with open(args.examples, "r") as examples_file:
-        for line in examples_file:
-            line = line.strip()
-            if line:
-                phoneme, example, *phonemes = line.split()
-                all_phonemes.add(phoneme)
-                phoneme_example[phoneme] = example
+    if args.examples:
+        with open(args.examples, "r") as examples_file:
+            for line in examples_file:
+                line = line.strip()
+                if line:
+                    phoneme, example, *phonemes = line.split()
+                    all_phonemes.add(phoneme)
+                    phoneme_example[phoneme] = example
 
     # -------------------------------------------------------------------------
 
@@ -118,6 +125,7 @@ def main():
     # Assume unassigned phonemes map to two "others".
     assigned_others = set(assignments.values())
     unassigned = all_phonemes - set(assignments)
+
     if unassigned:
         for word in words.values():
             if len(word.other) > len(word.phonemes):
@@ -144,11 +152,13 @@ def main():
 
     # Print candidates and ? for phonemes with no candidates
     for phoneme in sorted(all_phonemes):
-        print(
-            phoneme,
-            assignments.get(phoneme, args.missing),
-            phoneme_example.get(phoneme, ""),
-        )
+        print(phoneme, assignments.get(phoneme, args.missing), end="")
+
+        if args.examples:
+            print(" ", phoneme_example.get(phoneme, ""))
+        else:
+            # End line
+            print("")
 
 
 # -----------------------------------------------------------------------------

diff --git a/bin/check_profiles.py b/bin/check_profiles.py
@@ -8,7 +8,10 @@
 
 from utils import load_profile
 
+
 def main():
+    print("Reading YAML paths from stdin...", file=sys.stderr)
+
     for yml_path_str in sys.stdin:
         yml_path = Path(yml_path_str.strip())
         profile_dir = yml_path.parent
@@ -69,6 +72,7 @@ def main():
 
 # -----------------------------------------------------------------------------
 
+
 def path_exists(path):
     if not path.exists():
         # Try .gz
@@ -81,6 +85,7 @@ def path_exists(path):
 
     return True
 
+
 # -----------------------------------------------------------------------------
 
 if __name__ == "__main__":

diff --git a/bin/get_frequent_words.py b/bin/get_frequent_words.py
@@ -8,11 +8,14 @@
 """
 import argparse
 import heapq
+import re
 import sys
 
 # Ignore start/stop sentence tokens
 _IGNORE_WORDS = set(["<s>", "</s>"])
 
+_IGNORE_REGEX = re.compile(r"^[0-9,.%-]+$")
+
 
 def main():
     parser = argparse.ArgumentParser()
@@ -45,6 +48,10 @@ def main():
                 # Skip empty or ignored words
                 continue
 
+            # Skip numbers, etc.
+            if _IGNORE_REGEX.match(word):
+                continue
+
             if len(frequent_words) < args.n:
                 # Append to heap
                 heapq.heappush(frequent_words, (prob, word))

diff --git a/bin/make_g2p_align.sh b/bin/make_g2p_align.sh
@@ -0,0 +1,47 @@
+#!/usr/bin/env bash
+set -e
+
+# Creates a grapheme-to-phoneme alignment corpus from a pronunciation dictionary
+# using phonetisaurus.
+
+if [[ -z "$2" ]]; then
+    echo "Usage: make-g2p_align.sh DICTIONARY_IN CORPUS_OUT"
+    exit 1
+fi
+
+this_dir="$( cd "$( dirname "$0" )" && pwd )"
+
+corpus_path="$(realpath "$2")"
+
+# -----------------------------------------------------------------------------
+
+temp_dir="$(mktemp -d)"
+function finish {
+    rm -rf "${temp_dir}"
+}
+
+trap finish EXIT
+
+# -----------------------------------------------------------------------------
+
+if [[ "$1" == '-' ]]; then
+    # Read from stdin into temporary file
+    dict_path="${temp_dir}/unformatted.dict"
+    cat > "${dict_path}"
+else
+    dict_path="$(realpath "$1")"
+fi
+
+# Format dictionary for phonetisaurus
+cd "${temp_dir}"
+perl -pe 's/\([0-9]+\)//;
+            s/[ ]+/ /g; s/^[ ]+//;
+            s/[ ]+$//; @_ = split (/[ ]+/);
+            $w = shift (@_);
+            $_ = $w."\t".join (" ", @_)."\n";' < "${dict_path}" | sed -e '/[_|\xA0]/d' > formatted.dict
+
+# Generate g2p corpus
+"${this_dir}/phonetisaurus-alignment" --lexicon formatted.dict --seq2_del --verbose
+
+# Copy out of temporary directory
+cp train/model.corpus "${corpus_path}"