From 6cb0d5ecdea2f9592959aad7cbcdd35f12cb5112 Mon Sep 17 00:00:00 2001 From: Dimitri Papadopoulos <3234522+DimitriPapadopoulos@users.noreply.github.com> Date: Fri, 28 Jul 2023 12:05:19 +0200 Subject: [PATCH] Generate alternative typos with a translation table MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This way we can catch misspellings with alternative characters, typically an acute accent (´) instead of an apostrophe ('). In this case, the alternative character is a valid character and will be used both in the misspelling and the fix(es). The above is different from detecting Unicode phishing, where some characters like `A` are intentionally, or not, replaced by lookalikes such as `A`, `Α`, `А`, `ᗅ`, `ᴀ`, `A`. In that case, the alternative character is invalid and should be replaced by its valid counterpart in the fix. We do not address that here. --- codespell_lib/_codespell.py | 52 ++++++++++++++++++++++++------------- 1 file changed, 34 insertions(+), 18 deletions(-) diff --git a/codespell_lib/_codespell.py b/codespell_lib/_codespell.py index 19999cdbd6e..662d503d501 100644 --- a/codespell_lib/_codespell.py +++ b/codespell_lib/_codespell.py @@ -36,6 +36,10 @@ "(\\b(?:https?|[ts]?ftp|file|git|smb)://[^\\s]+(?=$|\\s)|" "\\b[\\w.%+-]+@[\\w.-]+\\b)" ) +# Pass all misspellings through this translation table to catch +# alternative misspellings. +alt_chars = (("'", "’"),) +_alt_char_trans_tables = tuple(str.maketrans(x, y) for x, y in alt_chars) encodings = ("utf-8", "iso-8859-1") USAGE = """ \t%prog [OPTIONS] [file1 file2 ... 
fileN] @@ -622,6 +626,29 @@ def build_ignore_words(filename: str, ignore_words: Set[str]) -> None: ignore_words.add(line.strip()) +def add_misspelling( + key: str, + data: str, + misspellings: Dict[str, Misspelling], +) -> None: + data = data.strip() + fix = data.rfind(",") + + if fix < 0: + fix = True + reason = "" + elif fix == (len(data) - 1): + data = data[:fix] + reason = "" + fix = False + else: + reason = data[fix + 1 :].strip() + data = data[:fix] + fix = False + + misspellings[key] = Misspelling(data, fix, reason) + + def build_dict( filename: str, misspellings: Dict[str, Misspelling], @@ -634,24 +661,13 @@ def build_dict( # support for fixing caps. key = key.lower() data = data.lower() - if key in ignore_words: - continue - data = data.strip() - fix = data.rfind(",") - - if fix < 0: - fix = True - reason = "" - elif fix == (len(data) - 1): - data = data[:fix] - reason = "" - fix = False - else: - reason = data[fix + 1 :].strip() - data = data[:fix] - fix = False - - misspellings[key] = Misspelling(data, fix, reason) + if key not in ignore_words: + add_misspelling(key, data, misspellings) + for table in _alt_char_trans_tables: + alt_key = key.translate(table) + alt_data = data.translate(table) + if alt_key not in ignore_words: + add_misspelling(alt_key, alt_data, misspellings) def is_hidden(filename: str, check_hidden: bool) -> bool: