Support both iso639-3 codes and BCP-47 language tags (#3060)

* Add support for iso639-3 language codes * Add support for retired language codes * Move langnames.py to the top-level * Add langcode() function * Add iso639retired dictionary * Improve wrapper functions * Add module docstring with doctest * Add 2-letter language codes * Add regular expression check * Improve inverse lookup of retired codes * Support BCP-47 * Avoid deprecated langcodes * Set stack level for warnings to warn on the langname call Now it throws e.g. ``` ...\nltk_3060.py:9: UserWarning: Shortening 'smo' to 'sm' print(f"{lang}: {langname(code)}") ``` Rather than ``` ...\nltk\langnames.py:64: UserWarning: Shortening zha to za warn(f"Shortening {code} to {code2}") ``` * Dict key membership is equivalent to dict membership * Resolve bug: subtag -> tag * Capitalize BCP47 in CorpusReader name * Reimplement removed type hint changes from #3081 Co-authored-by: Tom Aarsen <Cubiegamedev@gmail.com>
nltk · Dec 7, 2022 · f019fbe · f019fbe
1 parent 3ca43e2
commit f019fbe
Show file tree

Hide file tree

Showing 4 changed files with 953 additions and 0 deletions.
diff --git a/nltk/corpus/__init__.py b/nltk/corpus/__init__.py
@@ -74,6 +74,9 @@
 alpino: AlpinoCorpusReader = LazyCorpusLoader(
     "alpino", AlpinoCorpusReader, tagset="alpino"
 )
+bcp47: BCP47CorpusReader = LazyCorpusLoader(
+    "bcp47", BCP47CorpusReader, r"(cldr|iana)/*"
+)
 brown: CategorizedTaggedCorpusReader = LazyCorpusLoader(
     "brown",
     CategorizedTaggedCorpusReader,

diff --git a/nltk/corpus/reader/__init__.py b/nltk/corpus/reader/__init__.py
@@ -105,6 +105,7 @@
 from nltk.corpus.reader.comparative_sents import *
 from nltk.corpus.reader.panlex_lite import *
 from nltk.corpus.reader.panlex_swadesh import *
+from nltk.corpus.reader.bcp47 import *
 
 # Make sure that nltk.corpus.reader.bracket_parse gives the module, not
 # the function bracket_parse() defined in nltk.tree:
@@ -181,4 +182,5 @@
     "UnicharsCorpusReader",
     "MWAPPDBCorpusReader",
     "PanlexSwadeshCorpusReader",
+    "BCP47CorpusReader",
 ]
diff --git a/nltk/corpus/reader/bcp47.py b/nltk/corpus/reader/bcp47.py
@@ -0,0 +1,218 @@
+# Natural Language Toolkit: BCP-47 language tags
+#
+# Copyright (C) 2022 NLTK Project
+# Author: Eric Kafe <kafe.eric@gmail.com>
+# URL: <https://www.nltk.org/>
+# For license information, see LICENSE.TXT
+
+import re
+from warnings import warn
+from xml.etree import ElementTree as et
+
+from nltk.corpus.reader import CorpusReader
+
+
+class BCP47CorpusReader(CorpusReader):
+    """
+    Parse BCP-47 composite language tags
+
+    Supports all the main subtags, and the 'u-sd' extension:
+
+    >>> from nltk.corpus import bcp47
+    >>> bcp47.name('oc-gascon-u-sd-fr64')
+    'Occitan (post 1500): Gascon: Pyrénées-Atlantiques'
+
+    Can load a conversion table to Wikidata Q-codes:
+    >>> bcp47.load_wiki_q()
+    >>> bcp47.wiki_q['en-GI-spanglis']
+    'Q79388'
+
+    """
+
+    def __init__(self, root, fileids):
+        """Read the BCP-47 database"""
+        super().__init__(root, fileids)
+        self.langcode = {}
+        with self.open("iana/language-subtag-registry.txt") as fp:
+            self.db = self.data_dict(fp.read().split("%%\n"))
+        with self.open("cldr/common-subdivisions-en.xml") as fp:
+            self.subdiv = self.subdiv_dict(
+                et.parse(fp).iterfind("localeDisplayNames/subdivisions/subdivision")
+            )
+        self.morphology()
+
+    def load_wiki_q(self):
+        """Load conversion table to Wikidata Q-codes (only if needed)"""
+        with self.open("cldr/tools-cldr-rdf-external-entityToCode.tsv") as fp:
+            self.wiki_q = self.wiki_dict(fp.read().strip().split("\n")[1:])
+
+    def wiki_dict(self, lines):
+        """Convert Wikidata list of Q-codes to a BCP-47 dictionary"""
+        return {
+            pair[1]: pair[0].split("/")[-1]
+            for pair in [line.strip().split("\t") for line in lines]
+        }
+
+    def subdiv_dict(self, subdivs):
+        """Convert the CLDR subdivisions list to a dictionary"""
+        return {sub.attrib["type"]: sub.text for sub in subdivs}
+
+    def morphology(self):
+        self.casing = {
+            "language": str.lower,
+            "extlang": str.lower,
+            "script": str.title,
+            "region": str.upper,
+            "variant": str.lower,
+        }
+        dig = "[0-9]"
+        low = "[a-z]"
+        up = "[A-Z]"
+        alnum = "[a-zA-Z0-9]"
+        self.format = {
+            "language": re.compile(f"{low*3}?"),
+            "extlang": re.compile(f"{low*3}"),
+            "script": re.compile(f"{up}{low*3}"),
+            "region": re.compile(f"({up*2})|({dig*3})"),
+            "variant": re.compile(f"{alnum*4}{(alnum+'?')*4}"),
+            "singleton": re.compile(f"{low}"),
+        }
+
+    def data_dict(self, records):
+        """Convert the BCP-47 language subtag registry to a dictionary"""
+        self.version = records[0].replace("File-Date:", "").strip()
+        dic = {}
+        dic["deprecated"] = {}
+        for label in [
+            "language",
+            "extlang",
+            "script",
+            "region",
+            "variant",
+            "redundant",
+            "grandfathered",
+        ]:
+            dic["deprecated"][label] = {}
+        for record in records[1:]:
+            fields = [field.split(": ") for field in record.strip().split("\n")]
+            typ = fields[0][1]
+            tag = fields[1][1]
+            if typ not in dic:
+                dic[typ] = {}
+            subfields = {}
+            for field in fields[2:]:
+                if len(field) == 2:
+                    [key, val] = field
+                    if key not in subfields:
+                        subfields[key] = [val]
+                    else:  # multiple value
+                        subfields[key].append(val)
+                else:  # multiline field
+                    subfields[key][-1] += " " + field[0].strip()
+                if (
+                    "Deprecated" not in record
+                    and typ == "language"
+                    and key == "Description"
+                ):
+                    self.langcode[subfields[key][-1]] = tag
+            for key in subfields:
+                if len(subfields[key]) == 1:  # single value
+                    subfields[key] = subfields[key][0]
+            if "Deprecated" in record:
+                dic["deprecated"][typ][tag] = subfields
+            else:
+                dic[typ][tag] = subfields
+        return dic
+
+    def val2str(self, val):
+        """Return only first value"""
+        if type(val) == list:
+            #            val = "/".join(val) # Concatenate all values
+            val = val[0]
+        return val
+
+    def lang2str(self, lg_record):
+        """Concatenate subtag values"""
+        name = f"{lg_record['language']}"
+        for label in ["extlang", "script", "region", "variant", "extension"]:
+            if label in lg_record:
+                name += f": {lg_record[label]}"
+        return name
+
+    def parse_tag(self, tag):
+        """Convert a BCP-47 tag to a dictionary of labelled subtags"""
+        subtags = tag.split("-")
+        lang = {}
+        labels = ["language", "extlang", "script", "region", "variant", "variant"]
+        while subtags and labels:
+            subtag = subtags.pop(0)
+            found = False
+            while labels:
+                label = labels.pop(0)
+                subtag = self.casing[label](subtag)
+                if self.format[label].fullmatch(subtag):
+                    if subtag in self.db[label]:
+                        found = True
+                        valstr = self.val2str(self.db[label][subtag]["Description"])
+                        if label == "variant" and label in lang:
+                            lang[label] += ": " + valstr
+                        else:
+                            lang[label] = valstr
+                        break
+                    elif subtag in self.db["deprecated"][label]:
+                        found = True
+                        note = f"The {subtag!r} {label} code is deprecated"
+                        if "Preferred-Value" in self.db["deprecated"][label][subtag]:
+                            prefer = self.db["deprecated"][label][subtag][
+                                "Preferred-Value"
+                            ]
+                            note += f"', prefer '{self.val2str(prefer)}'"
+                        lang[label] = self.val2str(
+                            self.db["deprecated"][label][subtag]["Description"]
+                        )
+                        warn(note)
+                        break
+            if not found:
+                if subtag == "u" and subtags[0] == "sd":  # CLDR regional subdivisions
+                    sd = subtags[1]
+                    if sd in self.subdiv:
+                        ext = self.subdiv[sd]
+                    else:
+                        ext = f"<Unknown subdivision: {ext}>"
+                else:  # other extension subtags are not supported yet
+                    ext = f"{subtag}{''.join(['-'+ext for ext in subtags])}".lower()
+                    if not self.format["singleton"].fullmatch(subtag):
+                        ext = f"<Invalid extension: {ext}>"
+                        warn(ext)
+                lang["extension"] = ext
+                subtags = []
+        return lang
+
+    def name(self, tag):
+        """
+        Convert a BCP-47 tag to a colon-separated string of subtag names
+
+        >>> from nltk.corpus import bcp47
+        >>> bcp47.name('ca-Latn-ES-valencia')
+        'Catalan: Latin: Spain: Valencian'
+
+        """
+        for label in ["redundant", "grandfathered"]:
+            val = None
+            if tag in self.db[label]:
+                val = f"{self.db[label][tag]['Description']}"
+                note = f"The {tag!r} code is {label}"
+            elif tag in self.db["deprecated"][label]:
+                val = f"{self.db['deprecated'][label][tag]['Description']}"
+                note = f"The {tag!r} code is {label} and deprecated"
+                if "Preferred-Value" in self.db["deprecated"][label][tag]:
+                    prefer = self.db["deprecated"][label][tag]["Preferred-Value"]
+                    note += f", prefer {self.val2str(prefer)!r}"
+            if val:
+                warn(note)
+                return val
+        try:
+            return self.lang2str(self.parse_tag(tag))
+        except:
+            warn(f"Tag {tag!r} was not recognized")
+            return None