From aba4367b49750089e4e4122415a77cac43bd97bc Mon Sep 17 00:00:00 2001 From: Dirk Roorda Date: Thu, 9 Dec 2021 16:09:13 +0100 Subject: [PATCH] better metadata --- programs/phono.ipynb | 610 ++++++++++++++++++++------------------- programs/phono.py | 319 ++++++++++---------- tf/2021/otext@phono.tf | 6 +- tf/2021/phono.tf | 7 +- tf/2021/phono_trailer.tf | 7 +- yaml/generic.yaml | 3 + yaml/phono.yaml | 5 + 7 files changed, 486 insertions(+), 471 deletions(-) create mode 100644 yaml/generic.yaml create mode 100644 yaml/phono.yaml diff --git a/programs/phono.ipynb b/programs/phono.ipynb index bb23fe6..cc27d32 100644 --- a/programs/phono.ipynb +++ b/programs/phono.ipynb @@ -316,9 +316,11 @@ "import os\n", "import collections\n", "import re\n", + "import yaml\n", "import utils\n", "from tf.fabric import Fabric\n", - "from tf.writing.transcription import Transcription" + "from tf.writing.transcription import Transcription\n", + "from tf.core.helpers import formatMeta" ] }, { @@ -334,7 +336,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -348,7 +350,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": { "lines_to_next_cell": 2 }, @@ -383,7 +385,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -394,7 +396,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -403,7 +405,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -413,7 +415,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 7, "metadata": { "lines_to_next_cell": 2 }, @@ -436,7 +438,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 8, "metadata": { "lines_to_next_cell": 2 }, @@ -463,7 +465,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -473,10 +475,10 @@ "..............................................................................................\n", ". 0.00s Load the existing TF dataset .\n", "..............................................................................................\n", - "This is Text-Fabric 8.5.13\n", + "This is Text-Fabric 9.1.7\n", "Api reference : https://annotation.github.io/text-fabric/tf/cheatsheet.html\n", "\n", - "88 features found and 0 ignored\n" + "114 features found and 0 ignored\n" ] } ], @@ -487,14 +489,7 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 6, + "execution_count": 10, "metadata": { "lines_to_next_cell": 2 }, @@ -505,7 +500,7 @@ "text": [ " 0.00s loading features ...\n", " | 0.00s Dataset without structure sections in otext:no structure functions in the T-API\n", - " 4.38s All features loaded/computed - for details use loadLog()\n" + " 10s All features loaded/computed - for details use TF.isLoaded()\n" ] }, { @@ -525,7 +520,7 @@ " ('Text', 'text', ('T Text',))]" ] }, - "execution_count": 6, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -572,7 +567,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -593,7 +588,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -616,7 +611,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -637,7 +632,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -672,7 +667,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ @@ -682,7 +677,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 16, "metadata": { "lines_to_next_cell": 2 }, @@ -705,7 +700,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ @@ -741,7 +736,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ @@ -750,7 +745,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ @@ -796,7 +791,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 20, "metadata": { "lines_to_next_cell": 2 }, @@ -841,7 +836,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "metadata": {}, "outputs": [], "source": [ @@ -913,7 +908,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 22, "metadata": { "lines_to_next_cell": 2 }, @@ -948,7 +943,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 23, "metadata": {}, "outputs": [], "source": [ @@ -958,7 +953,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 24, "metadata": {}, "outputs": [], "source": [ @@ -983,7 +978,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ @@ -994,7 +989,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 26, "metadata": {}, "outputs": [], "source": [ @@ -1007,7 +1002,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 27, "metadata": {}, "outputs": [], "source": [ @@ -1020,7 +1015,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 28, "metadata": { "lines_to_next_cell": 2 }, @@ -1124,7 +1119,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 29, "metadata": {}, "outputs": [], "source": [ @@ -1135,7 +1130,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 30, "metadata": {}, "outputs": [], "source": [ @@ -1147,7 +1142,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 31, "metadata": {}, "outputs": [], "source": [ @@ -1160,7 +1155,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 32, "metadata": {}, "outputs": [], "source": [ @@ -1171,7 +1166,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 33, "metadata": {}, "outputs": [], "source": [ @@ -1193,7 +1188,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 34, "metadata": { "lines_to_next_cell": 2 }, @@ -1241,7 +1236,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 35, "metadata": {}, "outputs": [], "source": [ @@ -1250,7 +1245,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 36, "metadata": {}, "outputs": [], "source": [ @@ -1260,7 +1255,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 37, "metadata": {}, "outputs": [], "source": [ @@ -1271,7 +1266,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 38, "metadata": { "lines_to_next_cell": 2 }, @@ -1301,7 +1296,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 39, "metadata": {}, "outputs": [], "source": [ @@ -1319,7 +1314,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 40, "metadata": {}, "outputs": [], "source": [ @@ -1329,7 +1324,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 41, "metadata": {}, "outputs": [], "source": [ @@ -1340,7 +1335,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 42, "metadata": {}, "outputs": [], "source": [ @@ -1353,7 +1348,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 43, "metadata": {}, "outputs": [], "source": [ @@ -1363,7 +1358,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 44, "metadata": {}, "outputs": [], "source": [ @@ -1373,7 +1368,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 45, "metadata": {}, "outputs": [], "source": [ @@ -1395,7 +1390,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 46, "metadata": {}, "outputs": [], "source": [ @@ -1405,7 +1400,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 47, "metadata": {}, "outputs": [], "source": [ @@ -1414,7 +1409,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 48, "metadata": { "lines_to_next_cell": 2 }, @@ -1435,7 +1430,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 49, "metadata": {}, "outputs": [], "source": [ @@ -1444,7 +1439,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 50, "metadata": { "lines_to_next_cell": 2 }, @@ -1545,7 +1540,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 51, "metadata": {}, "outputs": [], "source": [ @@ -1557,7 +1552,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 52, "metadata": {}, "outputs": [], "source": [ @@ -1567,7 +1562,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 53, "metadata": {}, "outputs": [], "source": [ @@ -1579,7 +1574,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 54, "metadata": {}, "outputs": [], "source": [ @@ -1590,7 +1585,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 55, "metadata": {}, "outputs": [], "source": [ @@ -1600,7 +1595,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 56, "metadata": {}, "outputs": [], "source": [ @@ -1619,7 +1614,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 57, "metadata": {}, "outputs": [], "source": [ @@ -1628,7 +1623,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 58, "metadata": {}, "outputs": [], "source": [ @@ -1638,7 +1633,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 59, "metadata": {}, "outputs": [], "source": [ @@ -1654,7 +1649,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 60, "metadata": {}, "outputs": [], "source": [ @@ -1665,7 +1660,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 61, "metadata": {}, "outputs": [], "source": [ @@ -1688,7 +1683,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 62, "metadata": {}, "outputs": [], "source": [ @@ -1697,7 +1692,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 63, "metadata": { "lines_to_next_cell": 2 }, @@ -1729,7 +1724,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 64, "metadata": { "lines_to_next_cell": 2 }, @@ -1817,7 +1812,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 65, "metadata": {}, "outputs": [], "source": [ @@ -1850,7 +1845,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 66, "metadata": {}, "outputs": [], "source": [ @@ -1861,7 +1856,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 67, "metadata": {}, "outputs": [], "source": [ @@ -1871,7 +1866,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 68, "metadata": {}, "outputs": [], "source": [ @@ -1881,7 +1876,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 69, "metadata": {}, "outputs": [], "source": [ @@ -1891,7 +1886,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 70, "metadata": {}, "outputs": [], "source": [ @@ -1907,7 +1902,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 71, "metadata": {}, "outputs": [], "source": [ @@ -1917,7 +1912,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 72, "metadata": {}, "outputs": [], "source": [ @@ -1927,7 +1922,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 73, "metadata": { "lines_to_next_cell": 2 }, @@ -1948,7 +1943,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 74, "metadata": {}, "outputs": [], "source": [ @@ -1958,7 +1953,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 75, "metadata": {}, "outputs": [], "source": [ @@ -1970,7 +1965,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 76, "metadata": {}, "outputs": [], "source": [ @@ -1980,7 +1975,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 77, "metadata": {}, "outputs": [], "source": [ @@ -1991,7 +1986,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 78, "metadata": {}, "outputs": [], "source": [ @@ -2002,7 +1997,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 79, "metadata": {}, "outputs": [], "source": [ @@ -2012,7 +2007,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 80, "metadata": {}, "outputs": [], "source": [ @@ -2022,7 +2017,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 81, "metadata": { "lines_to_next_cell": 2 }, @@ -2069,7 +2064,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 82, "metadata": {}, "outputs": [], "source": [ @@ -2087,7 +2082,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 83, "metadata": { "lines_to_next_cell": 2 }, @@ -2127,7 +2122,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 84, "metadata": {}, "outputs": [], "source": [ @@ -2136,7 +2131,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 85, "metadata": { "lines_to_next_cell": 2 }, @@ -2173,7 +2168,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 86, "metadata": {}, "outputs": [], "source": [ @@ -2182,7 +2177,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 87, "metadata": {}, "outputs": [], "source": [ @@ -2209,7 +2204,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 88, "metadata": {}, "outputs": [], "source": [ @@ -2222,7 +2217,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 89, "metadata": { "lines_to_next_cell": 2 }, @@ -2251,7 +2246,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 90, "metadata": {}, "outputs": [], "source": [ @@ -2263,13 +2258,6 @@ "]" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, { "cell_type": "markdown", "metadata": { @@ -2286,7 +2274,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 91, "metadata": {}, "outputs": [], "source": [ @@ -2428,14 +2416,7 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 24, + "execution_count": 92, "metadata": {}, "outputs": [], "source": [ @@ -2535,14 +2516,7 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 25, + "execution_count": 93, "metadata": { "lines_to_next_cell": 2 }, @@ -2625,7 +2599,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 94, "metadata": { "lines_to_next_cell": 2 }, @@ -2722,7 +2696,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 95, "metadata": {}, "outputs": [], "source": [ @@ -2737,7 +2711,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 96, "metadata": {}, "outputs": [], "source": [ @@ -2752,7 +2726,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 97, "metadata": { "lines_to_next_cell": 2 }, @@ -2781,7 +2755,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 98, "metadata": {}, "outputs": [], "source": [ @@ -2791,7 +2765,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 99, "metadata": {}, "outputs": [], "source": [ @@ -2803,7 +2777,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 100, "metadata": {}, "outputs": [], "source": [ @@ -2813,7 +2787,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 101, "metadata": {}, "outputs": [], "source": [ @@ -2833,7 +2807,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 102, "metadata": {}, "outputs": [], "source": [ @@ -2856,7 +2830,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 103, "metadata": { "lines_to_next_cell": 2 }, @@ -2909,9 +2883,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 104, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "| 11s \tLooking for non-verb qamets\n" + ] + } + ], "source": [ "# find lexemes which have an occurrence with a qamets (except verbs)\n", "utils.caption(0, \"\\tLooking for non-verb qamets\")\n", @@ -2921,7 +2903,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 105, "metadata": { "lines_to_next_cell": 2 }, @@ -2930,8 +2912,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "| 48s \tLooking for non-verb qamets\n", - "| 50s \t4058 lexemes and 13452 unique occurrences\n" + "| 13s \t4056 lexemes and 13451 unique occurrences\n" ] } ], @@ -2968,7 +2949,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 106, "metadata": { "lines_to_next_cell": 2 }, @@ -2977,8 +2958,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "| 55s \tFiltering lexemes with varied occurrences\n", - "| 56s \t161 interesting lexemes with 1704 unique occurrences\n" + "| 13s \tFiltering lexemes with varied occurrences\n", + "| 13s \t161 interesting lexemes with 1704 unique occurrences\n" ] } ], @@ -3029,7 +3010,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 107, "metadata": {}, "outputs": [], "source": [ @@ -3047,7 +3028,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 108, "metadata": {}, "outputs": [], "source": [ @@ -3094,7 +3075,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 109, "metadata": {}, "outputs": [], "source": [ @@ -3110,7 +3091,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 110, "metadata": { "lines_to_next_cell": 2 }, @@ -3143,7 +3124,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 111, "metadata": { "lines_to_next_cell": 2 }, @@ -3152,14 +3133,14 @@ "name": "stdout", "output_type": "stream", "text": [ - "| 1m 07s \tGuessing between gadol and qatan\n", + "| 13s \tGuessing between gadol and qatan\n", "\tJM/: Override for syllable 1: ā becomes o\n", "\tBJT/: Override for syllable 1: o becomes ā\n", "\tJWMM: Override for syllable 2: becomes ā\n", "\tJHWNTN/: Override for syllable 2: becomes ā\n", "\tJRB(?!(?:[/!]|{v}))".format(v=vowel)) -# In[ ]: +# In[75]: # final mater lectionis @@ -1460,14 +1456,14 @@ def dages_forte_repl(match): last_ml_jw = re.compile(r"jw(?=[ &-]|\Z)") -# In[ ]: +# In[76]: # mappiq heh mappiq_heh = re.compile(r"h\.") -# In[ ]: +# In[77]: fixit_i = re.compile(r"([{v}])\.".format(v=complex_i_vowel)) @@ -1475,7 +1471,7 @@ def dages_forte_repl(match): fixit = re.compile(r"(.)\.") -# In[ ]: +# In[78]: split_sep = re.compile( @@ -1483,21 +1479,21 @@ def dages_forte_repl(match): ) # to split the result in the phono part and the interword part -# In[ ]: +# In[79]: def fixit_repl(match): return match.group(1) * 2 -# In[ ]: +# In[80]: def fixit_i_repl(match): return match.group(1) + "j" -# In[ ]: +# In[81]: def fixit_w_repl(match): @@ -1525,7 +1521,7 @@ def fixit_w_repl(match): # # The ``phono()`` function that carries out the complete transliteration, looks by default in ``qamets_corrections``, but this can be overridden. These corrections will not be carried out for the special verb cases. -# In[ ]: +# In[82]: qamets_corrections = {} # list of translits that must be corrected @@ -1533,7 +1529,7 @@ def fixit_w_repl(match): # apply correction instructions to a word -# In[19]: +# In[83]: def apply_corr(wordq, corr): @@ -1561,13 +1557,13 @@ def apply_corr(wordq, corr): # # We need concise, normalized values for the lexical features. -# In[ ]: +# In[84]: undefs = {"NA", "unknown", "n/a", "absent"} -# In[20]: +# In[85]: png = dict( @@ -1592,13 +1588,13 @@ def apply_corr(wordq, corr): # # We need a label for lexical information such as part of speech, person, number, gender. -# In[ ]: +# In[86]: declensed = {"subs", "nmpr", "adjv", "prps", "prde", "prin"} -# In[ ]: +# In[87]: def get_lex_info(w): @@ -1622,7 +1618,7 @@ def get_lex_info(w): return lex_info -# In[ ]: +# In[88]: def get_decl(lex_info): @@ -1632,7 +1628,7 @@ def get_decl(lex_info): return lex_info if len(parts) == 1 else parts[0] -# In[21]: +# In[89]: def get_prs(lex_info): @@ -1649,7 +1645,7 @@ def get_prs(lex_info): # # ## Phono parts -# In[22]: +# In[90]: interesting_stats = [ @@ -1660,12 +1656,6 @@ def get_prs(lex_info): ] -# In[ ]: - - - - - # if suppress_in_verb, phono will suppress qatan interpretation in certain verb paradigmatic forms # if suppress_in_prs, phono will suppress qatan interpreation in pronominal suffixes # if correct is 1, phono will apply individual corrections @@ -1673,7 +1663,7 @@ def get_prs(lex_info): # if correct is -1, phono will stop just before applying the qamets qatan corrections and return # the intermediate result -# In[23]: +# In[91]: def phono_qamets( @@ -1812,13 +1802,7 @@ def phono_qamets( return (result, False) -# In[ ]: - - - - - -# In[24]: +# In[92]: def phono_patterns(result, debug, count, dout): @@ -1915,13 +1899,7 @@ def phono_patterns(result, debug, count, dout): return result -# In[ ]: - - - - - -# In[25]: +# In[93]: def phono_symbols(ws, result, debug, count, dout): @@ -1992,7 +1970,7 @@ def phono_symbols(ws, result, debug, count, dout): # ## Phono whole # Here the rule fabrics are woven together, exceptions invoked. -# In[26]: +# In[94]: def phono( @@ -2072,7 +2050,7 @@ def phono( # to the number of consonants found in the paradigmatic material. # This is rather crude, but it will do. -# In[ ]: +# In[95]: # we need the number of letters in a defined value of a morpho feature @@ -2084,7 +2062,7 @@ def len_suffix(v): return len(v.replace("=", "").replace("W", "").replace("J", "")) -# In[ ]: +# In[96]: # we need a function that return 1 for plural/dual subs/adj and for fem adj @@ -2096,7 +2074,7 @@ def len_ending(sp, n, g): return 0 -# In[27]: +# In[97]: # return the number of consonants in the suffixes @@ -2113,14 +2091,14 @@ def len_morpho(w): # # Next, we reduce the vowel skeleton to a skeleton pattern. We are not interested in all vowels, only in whether the vowel is a qamets (gadol or qatan), A-like, O-like, or other (which we dub E-like). -# In[ ]: +# In[98]: # the qamets gadol/qatan skeleton qamets_qatan_skel = re.compile("([^@^])") -# In[ ]: +# In[99]: # the vowel skeleton where the qamets gadol/qatan are preserved as @ and ^ @@ -2129,14 +2107,14 @@ def len_morpho(w): silent_alef_start = re.compile(r"([ &-]|\A)>([!/]?(?:[^!/.:;@^aeiou]|\Z))") -# In[ ]: +# In[100]: def silent_alef_start_repl(match): return match.group(1) + "E" + match.group(2) -# In[ ]: +# In[101]: qamets_qatan_fullskel = re.compile( @@ -2153,7 +2131,7 @@ def silent_alef_start_repl(match): ) -# In[ ]: +# In[102]: def qamets_qatan_fullskel_repl(match): @@ -2173,7 +2151,7 @@ def qamets_qatan_fullskel_repl(match): return "" -# In[28]: +# In[103]: def get_full_skel(w, debug=False): @@ -2209,7 +2187,7 @@ def get_full_skel(w, debug=False): # # ### All candidates -# In[ ]: +# In[104]: # find lexemes which have an occurrence with a qamets (except verbs) @@ -2218,7 +2196,7 @@ def get_full_skel(w, debug=False): qq_lex = collections.defaultdict(lambda: []) -# In[29]: +# In[105]: for w in F.otype.s("word"): @@ -2244,7 +2222,7 @@ def get_full_skel(w, debug=False): # ### Filtering interesting candidates -# In[30]: +# In[106]: utils.caption(0, "\tFiltering lexemes with varied occurrences") @@ -2284,7 +2262,7 @@ def get_full_skel(w, debug=False): # ### Guess the qamets -# In[ ]: +# In[107]: qamets_qatan_xc = dict( @@ -2299,7 +2277,7 @@ def get_full_skel(w, debug=False): qamets_qatan_xcompiled[lex][pos] = ins -# In[ ]: +# In[108]: def compile_occs(lex, occs): @@ -2343,7 +2321,7 @@ def compile_occs(lex, occs): return occs_compiled -# In[ ]: +# In[109]: def guess_qq(occ, occs_compiled, debug=False): @@ -2356,7 +2334,7 @@ def guess_qq(occ, occs_compiled, debug=False): return guess -# In[31]: +# In[110]: def get_corr(fullskel, guess, debug=False): @@ -2377,7 +2355,7 @@ def get_corr(fullskel, guess, debug=False): # ### Carrying out the guess work -# In[32]: +# In[111]: utils.caption(0, "\tGuessing between gadol and qatan") @@ -2427,27 +2405,27 @@ def get_corr(fullskel, guess, debug=False): # # Generate phonological data -# In[ ]: +# In[112]: def stats_prog(): return " ".join(str(stats.get(stat, 0)) for stat in interesting_stats) -# In[ ]: +# In[113]: utils.caption(4, "Generating data in two ways ... ") -# In[ ]: +# In[114]: phono_file = [] word_file = [] -# In[ ]: +# In[115]: stats = collections.Counter() @@ -2476,7 +2454,7 @@ def stats_prog(): word_file.append((None, "", "+")) -# In[33]: +# In[116]: utils.caption(0, "\t{:>5} verses done {}".format(nv, stats_prog())) @@ -2499,7 +2477,7 @@ def stats_prog(): # # They should be consistent. -# In[ ]: +# In[117]: utils.caption(0, "{} items in phono".format(len(phono_file))) @@ -2514,7 +2492,7 @@ def stats_prog(): utils.caption(0, "\t{} lines".format(i)) -# In[35]: +# In[118]: phono_text = "".join(phono_file) @@ -2533,7 +2511,22 @@ def stats_prog(): # We also generate a config feature `otext@phono`, which will be picked up by Text-Fabric automatically. # In it we define the phonetic *format*, so that Text-Fabric has can output text in phonetic representation. -# In[36]: +# In[122]: + + +genericMetaPath = f"{thisRepo}/yaml/generic.yaml" +phonoMetaPath = f"{thisRepo}/yaml/phono.yaml" + +with open(genericMetaPath) as fh: + genericMeta = yaml.load(fh, Loader=yaml.FullLoader) + genericMeta["version"] = VERSION +with open(phonoMetaPath) as fh: + phonoMeta = formatMeta(yaml.load(fh, Loader=yaml.FullLoader)) + +metaData = {"": genericMeta, **phonoMeta} + + +# In[124]: utils.caption(4, "Writing TF phono features") @@ -2542,22 +2535,14 @@ def stats_prog(): phono_trailer=dict(((ln[0], ln[2]) for ln in word_file if ln[0] is not None)), ) edgeFeatures = {} -provenance = dict( - source="Phono Notebook applied to BHSA Data", - coreData="BHSA", - coreVersion=VERSION, - author="BHSA Data: Constantijn Sikkel; Phono Notebook: Dirk Roorda", -) -metaData = { - "": provenance, - "otext@phono": { - "about": "Provides phonetic transcriptions to Hebrew Words", - "see": "https://github.com/ETCBC/phono", - "fmt:text-phono-full": "{phono}{phono_trailer}", - }, - "phono": dict(valueType="str"), - "phono_trailer": dict(valueType="str"), +metaData["otext@phono"] = { + "about": "Provides phonetic transcriptions to Hebrew Words", + "see": "https://github.com/ETCBC/phono", + "fmt:text-phono-full": "{phono}{phono_trailer}", } +metaData["phono"]["valueType"] = "str" +metaData["phono_trailer"]["valueType"] = "str" + TF = Fabric(locations=thisTempTf, silent=True) TF.save(nodeFeatures=nodeFeatures, edgeFeatures=edgeFeatures, metaData=metaData) @@ -2566,7 +2551,7 @@ def stats_prog(): # # Check differences with previous versions. -# In[37]: +# In[125]: utils.checkDiffs(thisTempTf, thisTf, only=set(nodeFeatures)) @@ -2576,7 +2561,7 @@ def stats_prog(): # # Copy the new TF features from the temporary location where they have been created to their final destination. -# In[38]: +# In[126]: utils.deliverDataset(thisTempTf, thisTf) @@ -2584,13 +2569,13 @@ def stats_prog(): # # Compile TF -# In[ ]: +# In[127]: utils.caption(4, "Load and compile the new TF features") -# In[39]: +# In[128]: TF = Fabric(locations=[coreTf, thisTf], modules=[""]) @@ -2598,19 +2583,13 @@ def stats_prog(): api.makeAvailableIn(globals()) -# In[ ]: - - - - - -# In[ ]: +# In[129]: utils.caption(4, "Basic tests") -# In[ ]: +# In[130]: utils.caption(4, "First verses in phonetic transcription") @@ -2619,7 +2598,7 @@ def stats_prog(): utils.caption(0, T.text(L.d(v, "word"), fmt="text-phono-full"), continuation=True) -# In[40]: +# In[131]: utils.caption(4, "First verse in all formats") diff --git a/tf/2021/otext@phono.tf b/tf/2021/otext@phono.tf index 55670dd..f46f484 100644 --- a/tf/2021/otext@phono.tf +++ b/tf/2021/otext@phono.tf @@ -2,10 +2,10 @@ @about=Provides phonetic transcriptions to Hebrew Words @author=BHSA Data: Constantijn Sikkel; Phono Notebook: Dirk Roorda @coreData=BHSA -@coreVersion=2021 @fmt:text-phono-full={phono}{phono_trailer} +@provenance=computed by the phono notebook, see https://github.com/ETCBC/phono @see=https://github.com/ETCBC/phono -@source=Phono Notebook applied to BHSA Data +@version=2021 @writtenBy=Text-Fabric -@dateWritten=2021-11-30T15:33:30Z +@dateWritten=2021-12-09T14:25:56Z diff --git a/tf/2021/phono.tf b/tf/2021/phono.tf index 1bc3559..9a4d907 100644 --- a/tf/2021/phono.tf +++ b/tf/2021/phono.tf @@ -1,11 +1,12 @@ @node @author=BHSA Data: Constantijn Sikkel; Phono Notebook: Dirk Roorda @coreData=BHSA -@coreVersion=2021 -@source=Phono Notebook applied to BHSA Data +@description=🆗 phonological transcription (bᵊ rēšˌîṯ bārˈā ʔᵉlōhˈîm) +@provenance=computed by the phono notebook, see https://github.com/ETCBC/phono @valueType=str +@version=2021 @writtenBy=Text-Fabric -@dateWritten=2021-11-30T15:33:29Z +@dateWritten=2021-12-09T14:25:55Z bᵊ rēšˌîṯ diff --git a/tf/2021/phono_trailer.tf b/tf/2021/phono_trailer.tf index a39cd6d..a42b4a3 100644 --- a/tf/2021/phono_trailer.tf +++ b/tf/2021/phono_trailer.tf @@ -1,11 +1,12 @@ @node @author=BHSA Data: Constantijn Sikkel; Phono Notebook: Dirk Roorda @coreData=BHSA -@coreVersion=2021 -@source=Phono Notebook applied to BHSA Data +@description=🆗 interword material in phonological transcription +@provenance=computed by the phono notebook, see https://github.com/ETCBC/phono @valueType=str +@version=2021 @writtenBy=Text-Fabric -@dateWritten=2021-11-30T15:33:29Z +@dateWritten=2021-12-09T14:25:55Z diff --git a/yaml/generic.yaml b/yaml/generic.yaml new file mode 100644 index 0000000..ae01558 --- /dev/null +++ b/yaml/generic.yaml @@ -0,0 +1,3 @@ +provenance: computed by the phono notebook, see https://github.com/ETCBC/phono +coreData: BHSA +author: "BHSA Data: Constantijn Sikkel; Phono Notebook: Dirk Roorda" diff --git a/yaml/phono.yaml b/yaml/phono.yaml new file mode 100644 index 0000000..12c9fd4 --- /dev/null +++ b/yaml/phono.yaml @@ -0,0 +1,5 @@ +phono: + desc: 🆗 phonological transcription + eg: bᵊ rēšˌîṯ bārˈā ʔᵉlōhˈîm +phono_trailer: + desc: 🆗 interword material in phonological transcription