Replace unidecode with unicodedata, pinyin and romkan

Reason: potential licence issue with unidecode
sbrunner · Jul 5, 2022 · 6d0736f · 6d0736f
1 parent da41b35
commit 6d0736f
Show file tree

Hide file tree

Showing 6 changed files with 159 additions and 99 deletions.
diff --git a/Makefile b/Makefile
@@ -10,7 +10,7 @@ help: ## Display this help message
 
 .poetry.timestamps: pyproject.toml poetry.lock
 	poetry --version || pip install --user --requirement=requirements.txt
-	poetry install --extras=tools
+	poetry install --extras=tools --extras=generate
 	touch $@
 
 .PHONY: prospector

diff --git a/ci/config.yaml b/ci/config.yaml
@@ -3,6 +3,11 @@
 checks:
   versions:
     rebuild: False
+  codespell:
+    arguments:
+      - --quiet-level=2
+      - --check-filenames
+      - --ignore-words-list=nd
 
 publish:
   docker:

diff --git a/jsonschema_gentypes/__init__.py b/jsonschema_gentypes/__init__.py
@@ -3,19 +3,104 @@
 """
 
 import textwrap
+import unicodedata
 from abc import abstractmethod
 from typing import Callable, Dict, Iterator, List, Optional, Set, Tuple, Union, cast
 
-import yaml
 from jsonschema import RefResolver
-from unidecode import unidecode
 
 from jsonschema_gentypes import configuration, jsonschema
 
 # Raise issues here.
 ISSUE_URL = "https://github.com/camptcamp/jsonschema-gentypes"
 
 
+def __convert_char(char: str) -> str:  # Map one character to the text that replaces it in a name
+    import pinyin  # pylint: disable=import-outside-toplevel
+    import romkan  # pylint: disable=import-outside-toplevel
+
+    # Drop combining marks, i.e. the accents that NFKD decomposition splits off their base letter
+    if unicodedata.combining(char):
+        return ""
+    if char == "-":  # a hyphen is replaced by a space
+        return " "
+    category = unicodedata.category(char)
+    # Space separators (Zs) and control characters (Cc) => space
+    if category in ("Zs", "Cc"):
+        return " "
+    # Keep decimal digits (Nd) and lowercase/uppercase letters (Ll, Lu) as they are
+    if category in ("Nd", "Ll", "Lu"):
+        return char
+    # Other symbols (So) and other punctuation (Po) => their Unicode name surrounded by spaces
+    if category in ("So", "Po"):
+        name = unicodedata.name(char)
+        if category == "So":
+            name = name.replace(" SIGN", "")  # e.g. "COPYRIGHT SIGN" => "COPYRIGHT"
+        return f" {name} "
+    # Every other category goes through transliteration:
+    # first try pinyin (Chinese characters)
+    pinyin_char = pinyin.get(char, delimiter=" ")
+    return (
+        # Japanese characters: used when pinyin gives one character (presumably the input unchanged)
+        cast(str, romkan.to_roma(char))
+        if len(pinyin_char) == 1
+        else "".join(  # otherwise keep the pinyin, padded with spaces and without combining marks
+            [c for c in unicodedata.normalize("NFKD", f" {pinyin_char} ") if not unicodedata.combining(c)]
+        )
+    )
+
+
+def normalize(input_str: str) -> str:
+    """Normalize the string to be a Python name."""
+
+    # Decompose (NFKD) so accents become separate combining marks, then convert character by character
+    nfkd_form = unicodedata.normalize("NFKD", input_str)
+    name = "".join([__convert_char(c) for c in nfkd_form])
+
+    # No digit at first position: prefix with "num " (char_range is defined elsewhere in this module)
+    if name[0] in list(char_range("0", "9")):  # NOTE(review): IndexError when name is empty - confirm
+        name = f"num {name}"
+
+    # No Python keyword: the whole name is compared case-insensitively, " name" is appended on a match
+    if name.lower() in [
+        "and",
+        "as",
+        "assert",
+        "break",
+        "class",
+        "continue",
+        "def",
+        "del",
+        "elif",
+        "else",
+        "except",
+        "false",
+        "finally",
+        "for",
+        "from",
+        "global",
+        "if",
+        "import",
+        "in",
+        "is",
+        "lambda",
+        "none",
+        "nonlocal",
+        "not",
+        "or",
+        "pass",
+        "raise",
+        "return",
+        "true",
+        "try",
+        "while",
+        "with",
+        "yield",
+    ]:
+        name = f"{name} name"
+    return name  # can still contain spaces (e.g. after the "num " prefix)
+
+
 class Type:
     """
     The base Type object.
@@ -94,7 +179,7 @@ def set_name(self, name: str) -> None:
 
     def unescape_name(self) -> str:
         """
-        Return the unscaped name.
+        Return the unescaped name.
         """
         return self._name
 
@@ -158,7 +243,7 @@ def name(self) -> str:
 
 class NativeType(Type):
     """
-    Native Type that will essencially generates a Python import.
+    Native Type that essentially generates a Python import.
     """
 
     def __init__(self, name: str, package: str = "typing") -> None:
@@ -229,7 +314,7 @@ def __init__(self, name: str, sub_type: Type, descriptions: Optional[List[str]]
 
         Arguments:
             name: the type name
-            sub_type: the type that should be aliazed
+            sub_type: the type that should be aliased
             descriptions: the type description
         """
         super().__init__(name)
@@ -292,8 +377,8 @@ def definition(self, line_length: Optional[int] = None) -> List[str]:
         result += ["# The values for the enum"]
         for value in self.values:
             name = get_name({"title": f"{self._name} {value}"}, upper=True)
-            formated_value = f'"{value}"' if isinstance(value, str) else str(value)
-            result.append(f"{name}: {LiteralType(value).name()} = {formated_value}")
+            formatted_value = f'"{value}"' if isinstance(value, str) else str(value)
+            result.append(f"{name}: {LiteralType(value).name()} = {formatted_value}")
         return result
 
 
@@ -388,58 +473,15 @@ def get_name(
 
     Arguments:
         schema: the concerned schema
-        proposed_name: a name that we will use it the scheema hasn't any title
-        upper: should we use an upper cass (For constants)
+        proposed_name: a name that we will use if the schema has no title
+        upper: should we use an upper case (For constants)
     """
     # Get the base name
     has_title = isinstance(schema, dict) and "title" in schema
     name = schema["title"] if has_title else proposed_name  # type: ignore
     assert name is not None
-    # Unaccent, ...
-    name = unidecode(name)
-    # Remove unauthorised char
-    authorised_char = list(char_range("a", "z")) + list(char_range("A", "Z")) + list(char_range("0", "9"))
-    name = "".join([(c if c in authorised_char else " ") for c in name])
-    # No number at first position
-    if name[0] in list(char_range("0", "9")):
-        name = f"num {name}"
-    # No python keyword
-    if name.lower() in [
-        "and",
-        "as",
-        "assert",
-        "break",
-        "class",
-        "continue",
-        "def",
-        "del",
-        "elif",
-        "else",
-        "except",
-        "false",
-        "finally",
-        "for",
-        "from",
-        "global",
-        "if",
-        "import",
-        "in",
-        "is",
-        "lambda",
-        "none",
-        "nonlocal",
-        "not",
-        "or",
-        "pass",
-        "raise",
-        "return",
-        "true",
-        "try",
-        "while",
-        "with",
-        "yield",
-    ]:
-        name = f"{name} name"
+    name = normalize(name)
+
     prefix = "" if has_title else "_"
     if upper:
         # Upper case
@@ -460,6 +502,8 @@ def get_description(schema: jsonschema.JSONSchemaItem) -> List[str]:
     Arguments:
         schema: the concerned schema
     """
+    import yaml  # pylint: disable=import-outside-toplevel
+
     result: List[str] = []
     for key in ("title", "description"):
         if key in schema:
@@ -594,13 +638,13 @@ def _get_type_internal(self, schema: jsonschema.JSONSchemaItem, proposed_name: s
             then_schema.update(self._resolve_ref(cast(jsonschema.JSONSchemaItem, schema.get("then", {}))))
             if "properties" not in then_schema:
                 then_schema["properties"] = {}
-            then_propoerties = then_schema["properties"]
-            assert then_propoerties
+            then_properties = then_schema["properties"]
+            assert then_properties
             if_properties = self._resolve_ref(cast(jsonschema.JSONSchemaItem, schema.get("if", {}))).get(
                 "properties", {}
             )
             assert if_properties
-            then_propoerties.update(if_properties)
+            then_properties.update(if_properties)
             else_schema: jsonschema.JSONSchemaItem = {}
             else_schema.update(base_schema)
             else_schema.update(self._resolve_ref(cast(jsonschema.JSONSchemaItem, schema.get("else", {}))))
@@ -676,7 +720,7 @@ def _get_type_internal(self, schema: jsonschema.JSONSchemaItem, proposed_name: s
 
         if schema_type is None:
             type_ = BuiltinType("None")
-            type_.set_comments(["WARNING: we get an scheam without any type"])
+            type_.set_comments(["WARNING: we get an schema without any type"])
             return type_
         assert isinstance(schema_type, str), (
             f"Expected to find a supported schema type, got {schema_type}" f"\nDuring parsing of {schema}"
@@ -722,7 +766,7 @@ def ref(self, schema: jsonschema.JSONSchemaItem, proposed_name: str) -> Type:
     def any_of(
         self,
         schema: jsonschema.JSONSchemaItem,
-        subschema: List[jsonschema.JSONSchemaItem],
+        sub_schema: List[jsonschema.JSONSchemaItem],
         proposed_name: str,
         sub_name: str,
     ) -> Type:
@@ -779,9 +823,7 @@ def enum(self, schema: jsonschema.JSONSchemaItem, proposed_name: str) -> Type:
             get_description(schema),
         )
 
-    def boolean(
-        self, schema: jsonschema.JSONSchemaItem, proposed_name: str
-    ) -> Type:
+    def boolean(self, schema: jsonschema.JSONSchemaItem, proposed_name: str) -> Type:
         """
         Generate a ``bool`` annotation for a boolean object.
         """
@@ -819,9 +861,9 @@ def add_required(type_: Type, prop: str, required: Set[str]) -> Type:
 
             struct = {
                 prop: add_required(
-                    self.get_type(subschema, proposed_name + " " + prop, auto_alias=False), prop, required
+                    self.get_type(sub_schema, proposed_name + " " + prop, auto_alias=False), prop, required
                 )
-                for prop, subschema in properties.items()
+                for prop, sub_schema in properties.items()
             }
 
             type_: Type = TypedDictType(
@@ -884,7 +926,7 @@ def array(self, schema: jsonschema.JSONSchemaItem, proposed_name: str) -> Type:
     def any_of(
         self,
         schema: jsonschema.JSONSchemaItem,
-        subschema: List[jsonschema.JSONSchemaItem],
+        sub_schema: List[jsonschema.JSONSchemaItem],
         proposed_name: str,
         sub_name: str,
     ) -> Type:
@@ -896,7 +938,7 @@ def any_of(
                 lambda o: o is not None,
                 [
                     self.get_type(subs, f"{proposed_name} {sub_name}{index}")
-                    for index, subs in enumerate(subschema)
+                    for index, subs in enumerate(sub_schema)
                 ],
             )
         )
@@ -951,36 +993,28 @@ def ref(self, schema: jsonschema.JSONSchemaItem, proposed_name: str) -> Type:
             self.ref_type[ref] = type_
         return type_
 
-    def string(
-        self, schema: jsonschema.JSONSchemaItem, proposed_name: str
-    ) -> Type:
+    def string(self, schema: jsonschema.JSONSchemaItem, proposed_name: str) -> Type:
         """
         Generate a ``str`` annotation.
         """
         del schema, proposed_name
         return BuiltinType("str")
 
-    def number(
-        self, schema: jsonschema.JSONSchemaItem, proposed_name: str
-    ) -> Type:
+    def number(self, schema: jsonschema.JSONSchemaItem, proposed_name: str) -> Type:
         """
         Generate a ``Union[int, float]`` annotation.
         """
         del schema, proposed_name
         return CombinedType(NativeType("Union"), [BuiltinType("int"), BuiltinType("float")])
 
-    def integer(
-        self, schema: jsonschema.JSONSchemaItem, proposed_name: str
-    ) -> Type:
+    def integer(self, schema: jsonschema.JSONSchemaItem, proposed_name: str) -> Type:
         """
         Generate an ``int`` annotation.
         """
         del schema, proposed_name
         return BuiltinType("int")
 
-    def null(
-        self, schema: jsonschema.JSONSchemaItem, proposed_name: str
-    ) -> Type:
+    def null(self, schema: jsonschema.JSONSchemaItem, proposed_name: str) -> Type:
         """
         Generate an ``None`` annotation.
         """