Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

🎨 Enable strict type check and improve the project typing #207

Merged
merged 3 commits into from
Aug 14, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/lint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ jobs:
python setup.py install
- name: Type checking (Mypy)
run: |
mypy charset_normalizer
mypy --strict charset_normalizer
- name: Import sorting check (isort)
run: |
isort --check charset_normalizer
Expand Down
22 changes: 11 additions & 11 deletions charset_normalizer/api.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import logging
from os import PathLike
from os.path import basename, splitext
from typing import BinaryIO, List, Optional, Set
from typing import Any, BinaryIO, List, Optional, Set

from .cd import (
coherence_ratio,
Expand Down Expand Up @@ -36,8 +36,8 @@ def from_bytes(
steps: int = 5,
chunk_size: int = 512,
threshold: float = 0.2,
cp_isolation: List[str] = None,
cp_exclusion: List[str] = None,
cp_isolation: Optional[List[str]] = None,
cp_exclusion: Optional[List[str]] = None,
preemptive_behaviour: bool = True,
explain: bool = False,
) -> CharsetMatches:
Expand Down Expand Up @@ -486,8 +486,8 @@ def from_fp(
steps: int = 5,
chunk_size: int = 512,
threshold: float = 0.20,
cp_isolation: List[str] = None,
cp_exclusion: List[str] = None,
cp_isolation: Optional[List[str]] = None,
cp_exclusion: Optional[List[str]] = None,
preemptive_behaviour: bool = True,
explain: bool = False,
) -> CharsetMatches:
Expand All @@ -508,12 +508,12 @@ def from_fp(


def from_path(
path: PathLike,
path: "PathLike[Any]",
steps: int = 5,
chunk_size: int = 512,
threshold: float = 0.20,
cp_isolation: List[str] = None,
cp_exclusion: List[str] = None,
cp_isolation: Optional[List[str]] = None,
cp_exclusion: Optional[List[str]] = None,
preemptive_behaviour: bool = True,
explain: bool = False,
) -> CharsetMatches:
Expand All @@ -535,12 +535,12 @@ def from_path(


def normalize(
path: PathLike,
path: "PathLike[Any]",
steps: int = 5,
chunk_size: int = 512,
threshold: float = 0.20,
cp_isolation: List[str] = None,
cp_exclusion: List[str] = None,
cp_isolation: Optional[List[str]] = None,
cp_exclusion: Optional[List[str]] = None,
preemptive_behaviour: bool = True,
) -> CharsetMatch:
"""
Expand Down
8 changes: 5 additions & 3 deletions charset_normalizer/cd.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from codecs import IncrementalDecoder
from collections import Counter
from functools import lru_cache
from typing import Dict, List, Optional, Tuple
from typing import Counter as TypeCounter, Dict, List, Optional, Tuple

from .assets import FREQUENCIES
from .constant import KO_NAMES, LANGUAGE_SUPPORTED_COUNT, TOO_SMALL_SEQUENCE, ZH_NAMES
Expand All @@ -24,7 +24,9 @@ def encoding_unicode_range(iana_name: str) -> List[str]:
if is_multi_byte_encoding(iana_name):
raise IOError("Function not supported on multi-byte code page")

decoder = importlib.import_module("encodings.{}".format(iana_name)).IncrementalDecoder # type: ignore
decoder = importlib.import_module(
"encodings.{}".format(iana_name)
).IncrementalDecoder

p: IncrementalDecoder = decoder(errors="ignore")
seen_ranges: Dict[str, int] = {}
Expand Down Expand Up @@ -307,7 +309,7 @@ def coherence_ratio(
lg_inclusion_list.remove("Latin Based")

for layer in alpha_unicode_split(decoded_sequence):
sequence_frequencies: Counter = Counter(layer)
sequence_frequencies: TypeCounter[str] = Counter(layer)
most_common = sequence_frequencies.most_common()

character_count: int = sum(o for c, o in most_common)
Expand Down
4 changes: 2 additions & 2 deletions charset_normalizer/cli/normalizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from json import dumps
from os.path import abspath
from platform import python_version
from typing import List
from typing import List, Optional

try:
from unicodedata2 import unidata_version
Expand Down Expand Up @@ -48,7 +48,7 @@ def query_yes_no(question: str, default: str = "yes") -> bool:
sys.stdout.write("Please respond with 'yes' or 'no' " "(or 'y' or 'n').\n")


def cli_detect(argv: List[str] = None) -> int:
def cli_detect(argv: Optional[List[str]] = None) -> int:
"""
CLI assistant using ARGV and ArgumentParser
:param argv:
Expand Down
15 changes: 12 additions & 3 deletions charset_normalizer/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,16 @@
from hashlib import sha256
from json import dumps
from re import sub
from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
from typing import (
Any,
Counter as TypeCounter,
Dict,
Iterator,
List,
Optional,
Tuple,
Union,
)

from .constant import NOT_PRINTABLE_PATTERN, TOO_BIG_SEQUENCE
from .md import mess_ratio
Expand Down Expand Up @@ -95,7 +104,7 @@ def coherence_non_latin(self) -> float:
return 0.0

@property
def w_counter(self) -> Counter:
def w_counter(self) -> TypeCounter[str]:
"""
Word counter instance on decoded text.
Notice: Will be removed in 3.0
Expand Down Expand Up @@ -280,7 +289,7 @@ class CharsetMatches:
Acts like a list(iterable) but does not implement all related methods.
"""

def __init__(self, results: List[CharsetMatch] = None):
def __init__(self, results: Optional[List[CharsetMatch]] = None):
self._results: List[CharsetMatch] = sorted(results) if results else []

def __iter__(self) -> Iterator[CharsetMatch]:
Expand Down
18 changes: 14 additions & 4 deletions charset_normalizer/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from re import findall
from typing import Generator, List, Optional, Set, Tuple, Union

from _multibytecodec import MultibyteIncrementalDecoder # type: ignore
from _multibytecodec import MultibyteIncrementalDecoder

from .constant import (
ENCODING_MARKS,
Expand Down Expand Up @@ -231,6 +231,9 @@ def any_specified_encoding(sequence: bytes, search_zone: int = 4096) -> Optional
for specified_encoding in results:
specified_encoding = specified_encoding.lower().replace("-", "_")

encoding_alias: str
encoding_iana: str

for encoding_alias, encoding_iana in aliases.items():
if encoding_alias == specified_encoding:
return encoding_iana
Expand All @@ -256,7 +259,7 @@ def is_multi_byte_encoding(name: str) -> bool:
"utf_32_be",
"utf_7",
} or issubclass(
importlib.import_module("encodings.{}".format(name)).IncrementalDecoder, # type: ignore
importlib.import_module("encodings.{}".format(name)).IncrementalDecoder,
MultibyteIncrementalDecoder,
)

Expand Down Expand Up @@ -286,6 +289,9 @@ def should_strip_sig_or_bom(iana_encoding: str) -> bool:
def iana_name(cp_name: str, strict: bool = True) -> str:
cp_name = cp_name.lower().replace("-", "_")

encoding_alias: str
encoding_iana: str

for encoding_alias, encoding_iana in aliases.items():
if cp_name in [encoding_alias, encoding_iana]:
return encoding_iana
Expand Down Expand Up @@ -315,8 +321,12 @@ def cp_similarity(iana_name_a: str, iana_name_b: str) -> float:
if is_multi_byte_encoding(iana_name_a) or is_multi_byte_encoding(iana_name_b):
return 0.0

decoder_a = importlib.import_module("encodings.{}".format(iana_name_a)).IncrementalDecoder # type: ignore
decoder_b = importlib.import_module("encodings.{}".format(iana_name_b)).IncrementalDecoder # type: ignore
decoder_a = importlib.import_module(
"encodings.{}".format(iana_name_a)
).IncrementalDecoder
decoder_b = importlib.import_module(
"encodings.{}".format(iana_name_b)
).IncrementalDecoder

id_a: IncrementalDecoder = decoder_a(errors="ignore")
id_b: IncrementalDecoder = decoder_b(errors="ignore")
Expand Down