From d57432d182dbb9ea3ddda62efd235e5846a6bcb7 Mon Sep 17 00:00:00 2001 From: neurlang <77860779+neurlang@users.noreply.github.com> Date: Sat, 28 Dec 2024 19:14:32 +0100 Subject: [PATCH] reverse learning ---
# Notes (free-form commentary after the "---" separator; not part of any hunk).
#
# Purpose: add an optional "reverse" mode to the cmd/analysis tooling.
#   * clean_language.sh, creator.sh, mutator.sh, remover.sh: scan "$@" for the
#     literal argument --reverse (stopping at the first match) and, when found,
#     set reverse_flag="_reverse". The suffix is appended to the language JSON
#     name (language_reverse.json); clean_language.sh additionally writes
#     clean_reverse.tsv, and mutator.sh reads <$2>.language_reverse.json.
#     The source file stays dirty.tsv in both modes.
#   * main.go: new boolean flag -reverse; when set, the callback handed to
#     loop(*srcFile, 200, ...) swaps word1 and word2 before the randsubs check.
#   * clean_language_reverse.sh / study_language_reverse.sh: new two-line
#     wrappers that call the base script with $1..$9 followed by --reverse.
#   * train_language.sh now forwards $2..$6 (previously only $2);
#     train_language_reverse.sh is a new copy that reads clean_reverse.tsv and
#     writes weights1_reverse.json.lzw.
#
# NOTE(review): the wrappers forward $1..$9 unquoted, so arguments containing
#   whitespace are re-split and anything past the ninth argument is dropped.
# NOTE(review): clean_language.sh passes only $2..$9 to ./analysis. If the
#   wrapper is invoked with nine arguments, --reverse becomes the tenth
#   positional parameter: the "$@" scan still selects the _reverse file names,
#   but the flag itself is not handed to the binary.
# NOTE(review): the hunks for creator.sh, mutator.sh and remover.sh do not show
#   the ./analysis invocation, and study_language.sh is not touched by this
#   patch - confirm that each of them forwards --reverse to the binary.
# NOTE(review): flag.Bool returns a non-nil *bool, so "reverse != nil" is
#   redundant; it mirrors the neighbouring "randsubs != nil" check.
cmd/analysis/clean_language.sh | 13 +++++++++++-- cmd/analysis/clean_language_reverse.sh | 2 ++ cmd/analysis/creator.sh | 11 ++++++++++- cmd/analysis/main.go | 5 +++++ cmd/analysis/mutator.sh | 13 +++++++++++-- cmd/analysis/remover.sh | 11 ++++++++++- cmd/analysis/study_language_reverse.sh | 2 ++ cmd/analysis/train_language.sh | 2 +- cmd/analysis/train_language_reverse.sh | 5 +++++ 9 files changed, 57 insertions(+), 7 deletions(-) create mode 100755 cmd/analysis/clean_language_reverse.sh create mode 100755 cmd/analysis/study_language_reverse.sh create mode 100755 cmd/analysis/train_language_reverse.sh diff --git a/cmd/analysis/clean_language.sh b/cmd/analysis/clean_language.sh index c40a371..4f26234 100755 --- a/cmd/analysis/clean_language.sh +++ b/cmd/analysis/clean_language.sh @@ -1,9 +1,18 @@ #!/bin/bash +# Initialize a reverse flag +reverse_flag="" +for arg in "$@"; do + if [[ "$arg" == "--reverse" ]]; then + reverse_flag="_reverse" + break + fi +done + analysis_script="./analysis" -original_json="../../dicts/$1/language.json" +original_json="../../dicts/$1/language$reverse_flag.json" srcfile="../../dicts/$1/dirty.tsv" -dstfile="../../dicts/$1/clean.tsv" +dstfile="../../dicts/$1/clean$reverse_flag.tsv" $analysis_script --target 9999999999999 --lang "$original_json" --srcfile "$srcfile" --dstfile "$dstfile" -loss -nospaced -noipadash $2 $3 $4 $5 $6 $7 $8 $9 diff --git a/cmd/analysis/clean_language_reverse.sh b/cmd/analysis/clean_language_reverse.sh new file mode 100755 index 0000000..c8672df --- /dev/null +++ b/cmd/analysis/clean_language_reverse.sh @@ -0,0 +1,2 @@ +#!/bin/bash +./clean_language.sh $1 $2 $3 $4 $5 $6 $7 $8 $9 --reverse diff --git a/cmd/analysis/creator.sh b/cmd/analysis/creator.sh index 784c3a2..81e1265 100755 --- a/cmd/analysis/creator.sh +++ 
b/cmd/analysis/creator.sh @@ -1,7 +1,16 @@ #!/bin/bash +# Initialize a reverse flag +reverse_flag="" +for arg in "$@"; do + if [[ "$arg" == "--reverse" ]]; then + reverse_flag="_reverse" + break + fi +done + random=$(shuf -i 1-100000 -n 1) -original_json="../../dicts/$2/language.json" +original_json="../../dicts/$2/language$reverse_flag.json" mutated_json="/tmp/language_mutated.$random.json" analysis_script="./analysis" srcfile="../../dicts/$2/dirty.tsv" diff --git a/cmd/analysis/main.go b/cmd/analysis/main.go index cbed113..5a58211 100644 --- a/cmd/analysis/main.go +++ b/cmd/analysis/main.go @@ -177,6 +177,7 @@ func main() { nospaced := flag.Bool("nospaced", false, "delete spacing") padspace := flag.Bool("padspace", false, "insert space to the end of target word in case of a spaceless written language") matrices := flag.Bool("matrices", false, "show edit matrices") + reverse := flag.Bool("reverse", false, "reverse translation (swap source and target languages)") escapeunicode := flag.Bool("escapeunicode", false, "escape unicode when viewing") normalize := flag.String("normalize", "", "normalize unicode, for instance to NFC") deleteval := flag.Bool("deleteval", false, "delete one value") @@ -567,6 +568,10 @@ func main() { var threeways = make(map[string]uint64) loop(*srcFile, 200, func(word1, word2 string) { + + if reverse != nil && *reverse { + word1, word2 = word2, word1 + } if randsubs != nil && *randsubs != 0 { if rand.Intn(1+*randsubs) != 0 { diff --git a/cmd/analysis/mutator.sh b/cmd/analysis/mutator.sh index 626fc35..f1d33e0 100755 --- a/cmd/analysis/mutator.sh +++ b/cmd/analysis/mutator.sh @@ -1,8 +1,17 @@ #!/bin/bash +# Initialize a reverse flag +reverse_flag="" +for arg in "$@"; do + if [[ "$arg" == "--reverse" ]]; then + reverse_flag="_reverse" + break + fi +done + # Paths to the files -mutations_file="../../dicts/$1/$2.language.json" -original_json="../../dicts/$1/language.json" +mutations_file="../../dicts/$1/$2.language$reverse_flag.json" 
+original_json="../../dicts/$1/language$reverse_flag.json" mutated_json="/tmp/language_mutated.json" analysis_script="./analysis" srcfile="../../dicts/$1/dirty.tsv" diff --git a/cmd/analysis/remover.sh b/cmd/analysis/remover.sh index 6661f37..a911d9c 100755 --- a/cmd/analysis/remover.sh +++ b/cmd/analysis/remover.sh @@ -1,8 +1,17 @@ #!/bin/bash +# Initialize a reverse flag +reverse_flag="" +for arg in "$@"; do + if [[ "$arg" == "--reverse" ]]; then + reverse_flag="_reverse" + break + fi +done + # Paths to the files random=$(shuf -i 1-100000 -n 1) -original_json="../../dicts/$2/language.json" +original_json="../../dicts/$2/language$reverse_flag.json" mutated_json="/tmp/language_mutated.$random.json" analysis_script="./analysis" srcfile="../../dicts/$2/dirty.tsv" diff --git a/cmd/analysis/study_language_reverse.sh b/cmd/analysis/study_language_reverse.sh new file mode 100755 index 0000000..e0f0485 --- /dev/null +++ b/cmd/analysis/study_language_reverse.sh @@ -0,0 +1,2 @@ +#!/bin/bash +./study_language.sh $1 $2 $3 $4 $5 $6 $7 $8 $9 --reverse diff --git a/cmd/analysis/train_language.sh b/cmd/analysis/train_language.sh index d1f61eb..d59e85a 100755 --- a/cmd/analysis/train_language.sh +++ b/cmd/analysis/train_language.sh @@ -2,4 +2,4 @@ #train -../../../classifier/cmd/train_phonemizer/train_phonemizer --cleantsv ../../dicts/$1/clean.tsv --dstmodel ../../dicts/$1/weights1.json.lzw $2 +../../../classifier/cmd/train_phonemizer/train_phonemizer --cleantsv ../../dicts/$1/clean.tsv --dstmodel ../../dicts/$1/weights1.json.lzw $2 $3 $4 $5 $6 diff --git a/cmd/analysis/train_language_reverse.sh b/cmd/analysis/train_language_reverse.sh new file mode 100755 index 0000000..b1b7ca1 --- /dev/null +++ b/cmd/analysis/train_language_reverse.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +#train + +../../../classifier/cmd/train_phonemizer/train_phonemizer --cleantsv ../../dicts/$1/clean_reverse.tsv --dstmodel ../../dicts/$1/weights1_reverse.json.lzw $2 $3 $4 $5 $6