From 555a752c2f216db7f8c851bf2dd0831c1d2ee3a8 Mon Sep 17 00:00:00 2001
From: Dimitri Papadopoulos
 <3234522+DimitriPapadopoulos@users.noreply.github.com>
Date: Fri, 28 Jul 2023 12:05:19 +0200
Subject: [PATCH] Generate alternative typos with a translation table
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This way we can catch misspellings written with alternative characters,
typically the typographic apostrophe U+2019 (’) used instead of the
typewriter apostrophe U+0027 ('). In this case, the alternative
character is a valid character and will be used both in the
misspelling and in the fix(es).

The above is different from detecting Unicode phishing, where
some characters like `A` are replaced, intentionally or not,
by lookalikes such as `Α`, `А`, `ᗅ`, `ᴀ`, `Ａ`.
In that case, the alternative character is invalid and should
be replaced by its valid counterpart in the fix. We do not
address that case here.
---
 codespell_lib/_codespell.py | 43 ++++++++++++++++++++++++++-----------
 1 file changed, 30 insertions(+), 13 deletions(-)

diff --git a/codespell_lib/_codespell.py b/codespell_lib/_codespell.py
index 800e12c9947..b24c482bbde 100644
--- a/codespell_lib/_codespell.py
+++ b/codespell_lib/_codespell.py
@@ -36,6 +36,9 @@
     "(\\b(?:https?|[ts]?ftp|file|git|smb)://[^\\s]+(?=$|\\s)|"
     "\\b[\\w.%+-]+@[\\w.-]+\\b)"
 )
+# Pass all misspellings through this translation table to catch
+# alternative misspellings.
+alt_chars = (("'", "’"),)
 encodings = ("utf-8", "iso-8859-1")
 USAGE = """
 \t%prog [OPTIONS] [file1 file2 ... fileN]
@@ -622,31 +625,45 @@ def build_ignore_words(filename: str, ignore_words: Set[str]) -> None:
             ignore_words.add(line.strip())
 
 
+def add_misspelling(
+    key: str,
+    data: str,
+    misspellings: Dict[str, Misspelling],
+) -> None:
+    data = data.strip()
+
+    if "," in data:
+        fix = False
+        data, reason = data.rsplit(",", 1)
+        reason = reason.lstrip()
+    else:
+        fix = True
+        reason = ""
+
+    misspellings[key] = Misspelling(data, fix, reason)
+
+
 def build_dict(
     filename: str,
     misspellings: Dict[str, Misspelling],
     ignore_words: Set[str],
 ) -> None:
     with open(filename, encoding="utf-8") as f:
+        translate_tables = [(x, str.maketrans(x, y)) for x, y in alt_chars]
         for line in f:
             [key, data] = line.split("->")
             # TODO for now, convert both to lower. Someday we can maybe add
             # support for fixing caps.
             key = key.lower()
             data = data.lower()
-            if key in ignore_words:
-                continue
-            data = data.strip()
-
-            if "," in data:
-                fix = False
-                data, reason = data.rsplit(",", 1)
-                reason = reason.lstrip()
-            else:
-                fix = True
-                reason = ""
-
-            misspellings[key] = Misspelling(data, fix, reason)
+            if key not in ignore_words:
+                add_misspelling(key, data, misspellings)
+            for x, table in translate_tables:
+                if x in key:
+                    alt_key = key.translate(table)
+                    alt_data = data.translate(table)
+                    if alt_key not in ignore_words:
+                        add_misspelling(alt_key, alt_data, misspellings)
 
 
 def is_hidden(filename: str, check_hidden: bool) -> bool: