diff --git a/.github/workflows/chardet-bc.yml b/.github/workflows/chardet-bc.yml
index 0bbeaec8..dfbc64cc 100644
--- a/.github/workflows/chardet-bc.yml
+++ b/.github/workflows/chardet-bc.yml
@@ -25,7 +25,8 @@ jobs:
pip uninstall -y charset-normalizer
- name: Install the package
run: |
- python setup.py install
+ python -m build
+ pip install ./dist/*.whl
- name: Clone the complete dataset
run: |
git clone https://github.com/Ousret/char-dataset.git
diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml
deleted file mode 100644
index 1a7014d5..00000000
--- a/.github/workflows/codeql-analysis.yml
+++ /dev/null
@@ -1,56 +0,0 @@
-# For most projects, this workflow file will not need changing; you simply need
-# to commit it to your repository.
-#
-# You may wish to alter this file to override the set of languages analyzed,
-# or to provide custom queries or build logic.
-name: "CodeQL"
-
-on:
- push:
- branches: [master, develop]
- pull_request:
- # The branches below must be a subset of the branches above
- branches: [master, develop]
- schedule:
- - cron: '0 23 * * 0'
-
-jobs:
- analyze:
- name: Analyze
- runs-on: ubuntu-latest
-
- strategy:
- matrix:
- python-version: [3.9]
- fail-fast: false
-
- steps:
- - name: Checkout repository
- uses: actions/checkout@v2
- with:
- # We must fetch at least the immediate parents so that if this is
- # a pull request then we can checkout the head.
- fetch-depth: 2
-
- # If this run was triggered by a pull request event, then checkout
- # the head of the pull request instead of the merge commit.
- - run: git checkout HEAD^2
- if: ${{ github.event_name == 'pull_request' }}
-
- # Initializes the CodeQL tools for scanning.
- - name: Initialize CodeQL
- uses: github/codeql-action/init@v1
- with:
- languages: "python"
- # If you wish to specify custom queries, you can do so here or in a config file.
- # By default, queries listed here will override any specified in a config file.
- # Prefix the list here with "+" to use these queries and those in the config file.
- # queries: ./path/to/local/query, your-org/your-repo/queries@main
-
- # Autobuild attempts to build any compiled languages (C/C++, C#, or Java).
- # If this step fails, then you should remove it and run the build manually (see below)
- - name: Autobuild
- uses: github/codeql-action/autobuild@v1
-
- - name: Perform CodeQL Analysis
- uses: github/codeql-action/analyze@v1
diff --git a/.github/workflows/detector-coverage.yml b/.github/workflows/detector-coverage.yml
index 19eed9ae..1527f22b 100644
--- a/.github/workflows/detector-coverage.yml
+++ b/.github/workflows/detector-coverage.yml
@@ -25,7 +25,8 @@ jobs:
pip uninstall -y charset-normalizer
- name: Install the package
run: |
- python setup.py install
+ python -m build
+ pip install ./dist/*.whl
- name: Clone the complete dataset
run: |
git clone https://github.com/Ousret/char-dataset.git
diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml
index f74a56d2..00aa98eb 100644
--- a/.github/workflows/integration.yml
+++ b/.github/workflows/integration.yml
@@ -28,7 +28,8 @@ jobs:
pip uninstall -y charset-normalizer
- name: Install the package
run: |
- python setup.py install
+ python -m build
+ pip install ./dist/*.whl
- name: Clone the complete dataset
run: |
git clone https://github.com/Ousret/char-dataset.git
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index 877b890e..4f1f12f4 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -25,7 +25,8 @@ jobs:
pip uninstall -y charset-normalizer
- name: Install the package
run: |
- python setup.py install
+ python -m build
+ pip install ./dist/*.whl
- name: Type checking (Mypy)
run: |
mypy --strict charset_normalizer
diff --git a/.github/workflows/mypyc-verify.yml b/.github/workflows/mypyc-verify.yml
new file mode 100644
index 00000000..e9b2a9c7
--- /dev/null
+++ b/.github/workflows/mypyc-verify.yml
@@ -0,0 +1,40 @@
+name: MYPYC Run
+
+on: [push, pull_request]
+
+jobs:
+ detection_coverage:
+ runs-on: ${{ matrix.os }}
+
+ strategy:
+ fail-fast: false
+ matrix:
+ python-version: [3.6, 3.7, 3.8, 3.9, "3.10"]
+ os: [ubuntu-latest]
+
+ steps:
+ - uses: actions/checkout@v2
+ - name: Set up Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v2
+ with:
+ python-version: ${{ matrix.python-version }}
+ - name: Install dependencies
+ run: |
+ pip install -U pip setuptools
+ pip install -r dev-requirements.txt
+ pip uninstall -y charset-normalizer
+ - name: Install the package
+ env:
+ CHARSET_NORMALIZER_USE_MYPYC: '1'
+ run: |
+ python -m build --no-isolation
+ pip install ./dist/*.whl
+ - name: Clone the complete dataset
+ run: |
+ git clone https://github.com/Ousret/char-dataset.git
+ - name: Coverage WITH preemptive
+ run: |
+ python ./bin/coverage.py --coverage 97 --with-preemptive
+ - name: Coverage WITHOUT preemptive
+ run: |
+ python ./bin/coverage.py --coverage 95
diff --git a/.github/workflows/performance.yml b/.github/workflows/performance.yml
index fddd9d30..e675061a 100644
--- a/.github/workflows/performance.yml
+++ b/.github/workflows/performance.yml
@@ -25,7 +25,8 @@ jobs:
pip uninstall -y charset-normalizer
- name: Install the package
run: |
- python setup.py install
+ python -m build
+ pip install ./dist/*.whl
- name: Clone the complete dataset
run: |
git clone https://github.com/Ousret/char-dataset.git
diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml
index d9e664c1..2042d90e 100644
--- a/.github/workflows/python-publish.yml
+++ b/.github/workflows/python-publish.yml
@@ -29,7 +29,8 @@ jobs:
pip uninstall -y charset-normalizer
- name: Install the package
run: |
- python setup.py install
+ python -m build
+ pip install ./dist/*.whl
- name: Type checking (Mypy)
run: |
mypy charset_normalizer
@@ -51,7 +52,7 @@ jobs:
strategy:
fail-fast: false
matrix:
- python-version: [ 3.6, 3.7, 3.8, 3.9, "3.10" ]
+ python-version: [ 3.6, 3.7, 3.8, 3.9, "3.10", "3.11-dev" ]
os: [ ubuntu-latest ]
steps:
@@ -67,7 +68,8 @@ jobs:
pip uninstall -y charset-normalizer
- name: Install the package
run: |
- python setup.py install
+ python -m build
+ pip install ./dist/*.whl
- name: Run tests
run: |
pytest
@@ -96,7 +98,8 @@ jobs:
pip uninstall -y charset-normalizer
- name: Install the package
run: |
- python setup.py install
+ python -m build
+ pip install ./dist/*.whl
- name: Clone the complete dataset
run: |
git clone https://github.com/Ousret/char-dataset.git
@@ -136,7 +139,8 @@ jobs:
pip uninstall -y charset-normalizer
- name: Install the package
run: |
- python setup.py install
+ python -m build
+ pip install ./dist/*.whl
- name: Clone the complete dataset
run: |
git clone https://github.com/Ousret/char-dataset.git
@@ -146,11 +150,92 @@ jobs:
- name: Integration Tests with Requests
run: |
python ./bin/integration.py
+ universal-wheel:
+ runs-on: ubuntu-latest
+ needs:
+ - integration
+ steps:
+ - uses: actions/checkout@v2
+ - name: Set up Python
+ uses: actions/setup-python@v2
+ with:
+ python-version: '3.x'
+ - name: Update pip, setuptools, wheel, build and twine
+ run: |
+ python -m pip install --upgrade pip
+ pip install setuptools wheel build twine
+ - name: Build Wheel
+ env:
+ CHARSET_NORMALIZER_USE_MYPYC: '0'
+ run: python -m build
+ - name: Upload artifacts
+ uses: actions/upload-artifact@v3
+ with:
+ name: dist
+ path: dist
+
+ build-wheels:
+ name: Build wheels on ${{ matrix.os }} ${{ matrix.qemu }}
+ runs-on: ${{ matrix.os }}-latest
+ needs: universal-wheel
+ strategy:
+ matrix:
+ os: [ ubuntu, windows, macos ]
+ qemu: [ '' ]
+ include:
+ # Split ubuntu job for the sake of speed-up
+ - os: ubuntu
+ qemu: aarch64
+ - os: ubuntu
+ qemu: ppc64le
+ - os: ubuntu
+ qemu: s390x
+ steps:
+ - name: Checkout
+ uses: actions/checkout@v3
+ with:
+ submodules: true
+ - name: Set up QEMU
+ if: ${{ matrix.qemu }}
+ uses: docker/setup-qemu-action@v2
+ with:
+ platforms: all
+ id: qemu
+ - name: Prepare emulation
+ run: |
+ if [[ -n "${{ matrix.qemu }}" ]]; then
+ # Build emulated architectures only if QEMU is set,
+ # use default "auto" otherwise
+ echo "CIBW_ARCHS_LINUX=${{ matrix.qemu }}" >> $GITHUB_ENV
+ fi
+ shell: bash
+ - name: Setup Python
+ uses: actions/setup-python@v4
+ - name: Update pip, wheel, setuptools, build, twine
+ run: |
+ python -m pip install -U pip wheel setuptools build twine
+ - name: Build wheels
+ uses: pypa/cibuildwheel@2.10.2
+ env:
+ CIBW_BUILD_FRONTEND: "build"
+ CIBW_ARCHS_MACOS: x86_64 arm64 universal2
+ CIBW_ENVIRONMENT: CHARSET_NORMALIZER_USE_MYPYC='1'
+ CIBW_CONFIG_SETTINGS: "--no-isolation"
+ CIBW_BEFORE_BUILD: pip install -r dev-requirements.txt
+ CIBW_TEST_REQUIRES: pytest codecov pytest-cov
+ CIBW_TEST_COMMAND: pytest {package}/tests
+ CIBW_SKIP: pp*
+ - name: Upload artifacts
+ uses: actions/upload-artifact@v3
+ with:
+ name: dist
+ path: ./wheelhouse/*.whl
+
deploy:
runs-on: ubuntu-latest
needs:
- - integration
+ - build-wheels
steps:
- uses: actions/checkout@v2
@@ -162,10 +247,17 @@ jobs:
run: |
python -m pip install --upgrade pip
pip install setuptools wheel twine
- - name: Build and publish
+ - name: Download distributions
+ uses: actions/download-artifact@v3
+ with:
+ name: dist
+ path: dist
+ - name: Collected dists
+ run: |
+ tree dist
+ - name: Publish
env:
TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
run: |
- python setup.py sdist bdist_wheel
twine upload dist/*
diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml
index 2e999729..27dc5d5f 100644
--- a/.github/workflows/run-tests.yml
+++ b/.github/workflows/run-tests.yml
@@ -25,7 +25,8 @@ jobs:
pip uninstall -y charset-normalizer
- name: Install the package
run: |
- python setup.py install
+ python -m build --no-isolation
+ pip install ./dist/*.whl
- name: Run tests
run: |
pytest
diff --git a/CHANGELOG.md b/CHANGELOG.md
index b80e7cd1..dcfd8f76 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,48 @@
All notable changes to charset-normalizer will be documented in this file. This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
+## [3.0.0rc1](https://github.com/Ousret/charset_normalizer/compare/3.0.0b2...3.0.0rc1) (2022-10-18)
+
+### Added
+- Extend the capability of explain=True when cp_isolation contains at most two entries (min one); it will log the Mess-detector results in detail
+- Support for alternative language frequency set in charset_normalizer.assets.FREQUENCIES
+- Add parameter `language_threshold` in `from_bytes`, `from_path` and `from_fp` to adjust the minimum expected coherence ratio
+
+### Changed
+- Build with static metadata using 'build' frontend
+- Make the language detection stricter
+
+### Fixed
+- CLI with opt --normalize fails when using a full path for files
+- TooManyAccentuatedPlugin induces false positives on the mess detection when too few alpha characters have been fed to it
+
+### Removed
+- Coherence detector no longer returns 'Simple English'; it returns 'English' instead
+- Coherence detector no longer returns 'Classical Chinese'; it returns 'Chinese' instead
+
+## [3.0.0b2](https://github.com/Ousret/charset_normalizer/compare/3.0.0b1...3.0.0b2) (2022-08-21)
+
+### Added
+- `normalizer --version` now specifies whether the current version provides the extra speedup (meaning a mypyc-compiled wheel)
+
+### Removed
+- Breaking: Methods `first()` and `best()` from CharsetMatch
+- UTF-7 will no longer appear as "detected" without a recognized SIG/mark (it is unreliable and conflicts with ASCII)
+
+### Fixed
+- Sphinx warnings when generating the documentation
+
+## [3.0.0b1](https://github.com/Ousret/charset_normalizer/compare/2.1.0...3.0.0b1) (2022-08-15)
+
+### Changed
+- Optional: Module `md.py` can be compiled using Mypyc to provide an extra speedup, up to 4x faster than v2.1
+
+### Removed
+- Breaking: Class aliases CharsetDetector, CharsetDoctor, CharsetNormalizerMatch and CharsetNormalizerMatches
+- Breaking: Top-level function `normalize`
+- Breaking: Properties `chaos_secondary_pass`, `coherence_non_latin` and `w_counter` from CharsetMatch
+- Support for the backport `unicodedata2`
+
## [2.1.1](https://github.com/Ousret/charset_normalizer/compare/2.1.0...2.1.1) (2022-08-19)
### Deprecated
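Since the top-level `normalize()` helper is removed in 3.0, a minimal migration sketch is to pick the best match and re-encode it yourself via `CharsetMatch.output()` (the file names below are placeholders):

```python
from charset_normalizer import from_path

# Replacement for the removed normalize() helper: pick the best match and
# write a UTF-8 copy of the file ourselves. "my_file.txt" is hypothetical.
best_guess = from_path("my_file.txt").best()

if best_guess is None:
    raise IOError("No suitable encoding found for my_file.txt")

with open("my_file.utf_8.txt", "wb") as fp:
    fp.write(best_guess.output())  # output() re-encodes to UTF-8 by default
```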
diff --git a/README.md b/README.md
index d58ede1b..27736830 100644
--- a/README.md
+++ b/README.md
@@ -25,16 +25,16 @@ This project offers you an alternative to **Universal Charset Encoding Detector*
| Feature | [Chardet](https://github.com/chardet/chardet) | Charset Normalizer | [cChardet](https://github.com/PyYoshi/cChardet) |
| ------------- | :-------------: | :------------------: | :------------------: |
-| `Fast` | ❌<br> | :heavy_check_mark:<br> | :heavy_check_mark: <br>|
-| `Universal**` | ❌ | :heavy_check_mark: | ❌ |
-| `Reliable` **without** distinguishable standards | ❌ | :heavy_check_mark: | :heavy_check_mark: |
-| `Reliable` **with** distinguishable standards | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: |
+| `Fast` | ❌<br> | ✅<br> | ✅ <br>|
+| `Universal**` | ❌ | ✅ | ❌ |
+| `Reliable` **without** distinguishable standards | ❌ | ✅ | ✅ |
+| `Reliable` **with** distinguishable standards | ✅ | ✅ | ✅ |
| `License` | LGPL-2.1<br>_restrictive_ | MIT | MPL-1.1<br>_restrictive_ |
-| `Native Python` | :heavy_check_mark: | :heavy_check_mark: | ❌ |
-| `Detect spoken language` | ❌ | :heavy_check_mark: | N/A |
-| `UnicodeDecodeError Safety` | ❌ | :heavy_check_mark: | ❌ |
+| `Native Python` | ✅ | ✅ | ❌ |
+| `Detect spoken language` | ❌ | ✅ | N/A |
+| `UnicodeDecodeError Safety` | ❌ | ✅ | ❌ |
| `Whl Size` | 193.6 kB | 39.5 kB | ~200 kB |
-| `Supported Encoding` | 33 | :tada: [93](https://charset-normalizer.readthedocs.io/en/latest/user/support.html#supported-encodings) | 40
+| `Supported Encoding` | 33 | :tada: [90](https://charset-normalizer.readthedocs.io/en/latest/user/support.html#supported-encodings) | 40
@@ -53,12 +53,12 @@ This package offer better performance than its counterpart Chardet. Here are som
| Package | Accuracy | Mean per file (ms) | File per sec (est) |
| ------------- | :-------------: | :------------------: | :------------------: |
| [chardet](https://github.com/chardet/chardet) | 86 % | 200 ms | 5 file/sec |
-| charset-normalizer | **98 %** | **39 ms** | 26 file/sec |
+| charset-normalizer | **98 %** | **10 ms** | 100 file/sec |
| Package | 99th percentile | 95th percentile | 50th percentile |
| ------------- | :-------------: | :------------------: | :------------------: |
| [chardet](https://github.com/chardet/chardet) | 1200 ms | 287 ms | 23 ms |
-| charset-normalizer | 400 ms | 200 ms | 15 ms |
+| charset-normalizer | 100 ms | 50 ms | 5 ms |
Chardet's performance on larger files (1MB+) is very poor. Expect a huge difference on large payloads.
@@ -68,9 +68,6 @@ Chardet's performance on larger file (1MB+) are very poor. Expect huge differenc
> Keep in mind that the stats are generous and that Chardet accuracy vs ours is measured using Chardet's initial capability
> (e.g. Supported Encoding). Challenge them if you want.
-[cchardet](https://github.com/PyYoshi/cChardet) is a non-native (cpp binding) and unmaintained faster alternative with
-a better accuracy than chardet but lower than this package. If speed is the most important factor, you should try it.
-
## ✨ Installation
Using PyPi for latest stable
@@ -78,11 +75,6 @@ Using PyPi for latest stable
pip install charset-normalizer -U
```
-If you want a more up-to-date `unicodedata` than the one available in your Python setup.
-```sh
-pip install charset-normalizer[unicode_backport] -U
-```
-
## 🚀 Basic Usage
### CLI
diff --git a/bin/run_autofix.sh b/bin/run_autofix.sh
index f853cacd..e88f45c6 100755
--- a/bin/run_autofix.sh
+++ b/bin/run_autofix.sh
@@ -7,5 +7,5 @@ fi
set -x
-${PREFIX}black --target-version=py35 charset_normalizer
+${PREFIX}black --target-version=py36 charset_normalizer
${PREFIX}isort charset_normalizer
diff --git a/bin/run_checks.sh b/bin/run_checks.sh
index 0ae730eb..1e135b35 100755
--- a/bin/run_checks.sh
+++ b/bin/run_checks.sh
@@ -8,7 +8,7 @@ fi
set -x
${PREFIX}pytest
-${PREFIX}black --check --diff --target-version=py35 charset_normalizer
+${PREFIX}black --check --diff --target-version=py36 charset_normalizer
${PREFIX}flake8 charset_normalizer
${PREFIX}mypy charset_normalizer
${PREFIX}isort --check --diff charset_normalizer
diff --git a/charset_normalizer/__init__.py b/charset_normalizer/__init__.py
index 2dcaf56f..ebb5da89 100644
--- a/charset_normalizer/__init__.py
+++ b/charset_normalizer/__init__.py
@@ -21,14 +21,8 @@
"""
import logging
-from .api import from_bytes, from_fp, from_path, normalize
-from .legacy import (
- CharsetDetector,
- CharsetDoctor,
- CharsetNormalizerMatch,
- CharsetNormalizerMatches,
- detect,
-)
+from .api import from_bytes, from_fp, from_path
+from .legacy import detect
from .models import CharsetMatch, CharsetMatches
from .utils import set_logging_handler
from .version import VERSION, __version__
@@ -37,14 +31,9 @@
"from_fp",
"from_path",
"from_bytes",
- "normalize",
"detect",
"CharsetMatch",
"CharsetMatches",
- "CharsetNormalizerMatch",
- "CharsetNormalizerMatches",
- "CharsetDetector",
- "CharsetDoctor",
"__version__",
"VERSION",
"set_logging_handler",
diff --git a/charset_normalizer/api.py b/charset_normalizer/api.py
index b6c37e8b..6c7e8983 100644
--- a/charset_normalizer/api.py
+++ b/charset_normalizer/api.py
@@ -1,7 +1,5 @@
import logging
-import warnings
from os import PathLike
-from os.path import basename, splitext
from typing import Any, BinaryIO, List, Optional, Set
from .cd import (
@@ -41,6 +39,7 @@ def from_bytes(
cp_exclusion: Optional[List[str]] = None,
preemptive_behaviour: bool = True,
explain: bool = False,
+ language_threshold: float = 0.1,
) -> CharsetMatches:
"""
Given a raw bytes sequence, return the best possibles charset usable to render str objects.
@@ -201,6 +200,13 @@ def from_bytes(
encoding_iana,
)
continue
+ if encoding_iana in {"utf_7"} and not bom_or_sig_available:
+ logger.log(
+ TRACE,
+ "Encoding %s won't be tested as-is because detection is unreliable without BOM/SIG.",
+ encoding_iana,
+ )
+ continue
try:
is_multi_byte_decoder: bool = is_multi_byte_encoding(encoding_iana)
@@ -297,7 +303,13 @@ def from_bytes(
):
md_chunks.append(chunk)
- md_ratios.append(mess_ratio(chunk, threshold))
+ md_ratios.append(
+ mess_ratio(
+ chunk,
+ threshold,
+ explain is True and 1 <= len(cp_isolation) <= 2,
+ )
+ )
if md_ratios[-1] >= threshold:
early_stop_count += 1
@@ -389,7 +401,9 @@ def from_bytes(
if encoding_iana != "ascii":
for chunk in md_chunks:
chunk_languages = coherence_ratio(
- chunk, 0.1, ",".join(target_languages) if target_languages else None
+ chunk,
+ language_threshold,
+ ",".join(target_languages) if target_languages else None,
)
cd_ratios.append(chunk_languages)
@@ -491,6 +505,7 @@ def from_fp(
cp_exclusion: Optional[List[str]] = None,
preemptive_behaviour: bool = True,
explain: bool = False,
+ language_threshold: float = 0.1,
) -> CharsetMatches:
"""
Same thing than the function from_bytes but using a file pointer that is already ready.
@@ -505,6 +520,7 @@ def from_fp(
cp_exclusion,
preemptive_behaviour,
explain,
+ language_threshold,
)
@@ -517,6 +533,7 @@ def from_path(
cp_exclusion: Optional[List[str]] = None,
preemptive_behaviour: bool = True,
explain: bool = False,
+ language_threshold: float = 0.1,
) -> CharsetMatches:
"""
Same thing than the function from_bytes but with one extra step. Opening and reading given file path in binary mode.
@@ -532,53 +549,5 @@ def from_path(
cp_exclusion,
preemptive_behaviour,
explain,
+ language_threshold,
)
-
-
-def normalize(
- path: "PathLike[Any]",
- steps: int = 5,
- chunk_size: int = 512,
- threshold: float = 0.20,
- cp_isolation: Optional[List[str]] = None,
- cp_exclusion: Optional[List[str]] = None,
- preemptive_behaviour: bool = True,
-) -> CharsetMatch:
- """
- Take a (text-based) file path and try to create another file next to it, this time using UTF-8.
- """
- warnings.warn(
- "normalize is deprecated and will be removed in 3.0",
- DeprecationWarning,
- )
-
- results = from_path(
- path,
- steps,
- chunk_size,
- threshold,
- cp_isolation,
- cp_exclusion,
- preemptive_behaviour,
- )
-
- filename = basename(path)
- target_extensions = list(splitext(filename))
-
- if len(results) == 0:
- raise IOError(
- 'Unable to normalize "{}", no encoding charset seems to fit.'.format(
- filename
- )
- )
-
- result = results.best()
-
- target_extensions[0] += "-" + result.encoding # type: ignore
-
- with open(
- "{}".format(str(path).replace(filename, "".join(target_extensions))), "wb"
- ) as fp:
- fp.write(result.output()) # type: ignore
-
- return result # type: ignore
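A hedged usage sketch of the new `language_threshold` keyword; the payload is made up and the reported values depend on the run:

```python
from charset_normalizer import from_bytes

payload = "Ceci est un exemple de texte accentué, évidemment déjà encodé.".encode("cp1252")

# language_threshold is forwarded to coherence_ratio(); raising it above the
# 0.1 default makes the language (coherence) layer stricter.
results = from_bytes(payload, language_threshold=0.3)

best_guess = results.best()
if best_guess is not None:
    print(best_guess.encoding, best_guess.language)
```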
diff --git a/charset_normalizer/assets/__init__.py b/charset_normalizer/assets/__init__.py
index 3c33ba30..9075930d 100644
--- a/charset_normalizer/assets/__init__.py
+++ b/charset_normalizer/assets/__init__.py
@@ -1,6 +1,8 @@
# -*- coding: utf-8 -*-
from typing import Dict, List
+# Language labels that contain the em dash "—"
+# character are to be considered an alternative sequence of the original label
FREQUENCIES: Dict[str, List[str]] = {
"English": [
"e",
@@ -30,6 +32,34 @@
"z",
"q",
],
+ "English—": [
+ "e",
+ "a",
+ "t",
+ "i",
+ "o",
+ "n",
+ "s",
+ "r",
+ "h",
+ "l",
+ "d",
+ "c",
+ "m",
+ "u",
+ "f",
+ "p",
+ "g",
+ "w",
+ "b",
+ "y",
+ "v",
+ "k",
+ "j",
+ "x",
+ "z",
+ "q",
+ ],
"German": [
"e",
"n",
@@ -226,33 +256,303 @@
"ж",
"ц",
],
+ # Jap-Kanji
"Japanese": [
+ "人",
+ "一",
+ "大",
+ "亅",
+ "丁",
+ "丨",
+ "竹",
+ "笑",
+ "口",
+ "日",
+ "今",
+ "二",
+ "彳",
+ "行",
+ "十",
+ "土",
+ "丶",
+ "寸",
+ "寺",
+ "時",
+ "乙",
+ "丿",
+ "乂",
+ "气",
+ "気",
+ "冂",
+ "巾",
+ "亠",
+ "市",
+ "目",
+ "儿",
+ "見",
+ "八",
+ "小",
+ "凵",
+ "県",
+ "月",
+ "彐",
+ "門",
+ "間",
+ "木",
+ "東",
+ "山",
+ "出",
+ "本",
+ "中",
+ "刀",
+ "分",
+ "耳",
+ "又",
+ "取",
+ "最",
+ "言",
+ "田",
+ "心",
+ "思",
+ "刂",
+ "前",
+ "京",
+ "尹",
+ "事",
+ "生",
+ "厶",
+ "云",
+ "会",
+ "未",
+ "来",
+ "白",
+ "冫",
+ "楽",
+ "灬",
+ "馬",
+ "尸",
+ "尺",
+ "駅",
+ "明",
+ "耂",
+ "者",
+ "了",
+ "阝",
+ "都",
+ "高",
+ "卜",
+ "占",
+ "厂",
+ "广",
+ "店",
+ "子",
+ "申",
+ "奄",
+ "亻",
+ "俺",
+ "上",
+ "方",
+ "冖",
+ "学",
+ "衣",
+ "艮",
+ "食",
+ "自",
+ ],
+ # Jap-Katakana
+ "Japanese—": [
+ "ー",
+ "ン",
+ "ス",
+ "・",
+ "ル",
+ "ト",
+ "リ",
+ "イ",
+ "ア",
+ "ラ",
+ "ッ",
+ "ク",
+ "ド",
+ "シ",
+ "レ",
+ "ジ",
+ "タ",
+ "フ",
+ "ロ",
+ "カ",
+ "テ",
+ "マ",
+ "ィ",
+ "グ",
+ "バ",
+ "ム",
+ "プ",
+ "オ",
+ "コ",
+ "デ",
+ "ニ",
+ "ウ",
+ "メ",
+ "サ",
+ "ビ",
+ "ナ",
+ "ブ",
+ "ャ",
+ "エ",
+ "ュ",
+ "チ",
+ "キ",
+ "ズ",
+ "ダ",
+ "パ",
+ "ミ",
+ "ェ",
+ "ョ",
+ "ハ",
+ "セ",
+ "ベ",
+ "ガ",
+ "モ",
+ "ツ",
+ "ネ",
+ "ボ",
+ "ソ",
+ "ノ",
+ "ァ",
+ "ヴ",
+ "ワ",
+ "ポ",
+ "ペ",
+ "ピ",
+ "ケ",
+ "ゴ",
+ "ギ",
+ "ザ",
+ "ホ",
+ "ゲ",
+ "ォ",
+ "ヤ",
+ "ヒ",
+ "ユ",
+ "ヨ",
+ "ヘ",
+ "ゼ",
+ "ヌ",
+ "ゥ",
+ "ゾ",
+ "ヶ",
+ "ヂ",
+ "ヲ",
+ "ヅ",
+ "ヵ",
+ "ヱ",
+ "ヰ",
+ "ヮ",
+ "ヽ",
+ "゠",
+ "ヾ",
+ "ヷ",
+ "ヿ",
+ "ヸ",
+ "ヹ",
+ "ヺ",
+ ],
+ # Jap-Hiragana
+ "Japanese——": [
"の",
"に",
"る",
"た",
- "は",
- "ー",
"と",
+ "は",
"し",
+ "い",
"を",
"で",
"て",
"が",
- "い",
- "ン",
- "れ",
"な",
- "年",
- "ス",
- "っ",
- "ル",
+ "れ",
"か",
"ら",
- "あ",
"さ",
- "も",
+ "っ",
"り",
+ "す",
+ "あ",
+ "も",
+ "こ",
+ "ま",
+ "う",
+ "く",
+ "よ",
+ "き",
+ "ん",
+ "め",
+ "お",
+ "け",
+ "そ",
+ "つ",
+ "だ",
+ "や",
+ "え",
+ "ど",
+ "わ",
+ "ち",
+ "み",
+ "せ",
+ "じ",
+ "ば",
+ "へ",
+ "び",
+ "ず",
+ "ろ",
+ "ほ",
+ "げ",
+ "む",
+ "べ",
+ "ひ",
+ "ょ",
+ "ゆ",
+ "ぶ",
+ "ご",
+ "ゃ",
+ "ね",
+ "ふ",
+ "ぐ",
+ "ぎ",
+ "ぼ",
+ "ゅ",
+ "づ",
+ "ざ",
+ "ぞ",
+ "ぬ",
+ "ぜ",
+ "ぱ",
+ "ぽ",
+ "ぷ",
+ "ぴ",
+ "ぃ",
+ "ぁ",
+ "ぇ",
+ "ぺ",
+ "ゞ",
+ "ぢ",
+ "ぉ",
+ "ぅ",
+ "ゐ",
+ "ゝ",
+ "ゑ",
+ "゛",
+ "゜",
+ "ゎ",
+ "ゔ",
+ "゚",
+ "ゟ",
+ "゙",
+ "ゕ",
+ "ゖ",
],
"Portuguese": [
"a",
@@ -340,6 +640,77 @@
"就",
"出",
"会",
+ "可",
+ "也",
+ "你",
+ "对",
+ "生",
+ "能",
+ "而",
+ "子",
+ "那",
+ "得",
+ "于",
+ "着",
+ "下",
+ "自",
+ "之",
+ "年",
+ "过",
+ "发",
+ "后",
+ "作",
+ "里",
+ "用",
+ "道",
+ "行",
+ "所",
+ "然",
+ "家",
+ "种",
+ "事",
+ "成",
+ "方",
+ "多",
+ "经",
+ "么",
+ "去",
+ "法",
+ "学",
+ "如",
+ "都",
+ "同",
+ "现",
+ "当",
+ "没",
+ "动",
+ "面",
+ "起",
+ "看",
+ "定",
+ "天",
+ "分",
+ "还",
+ "进",
+ "好",
+ "小",
+ "部",
+ "其",
+ "些",
+ "主",
+ "样",
+ "理",
+ "心",
+ "她",
+ "本",
+ "前",
+ "开",
+ "但",
+ "因",
+ "只",
+ "从",
+ "想",
+ "实",
],
"Ukrainian": [
"о",
@@ -956,34 +1327,6 @@
"ö",
"y",
],
- "Simple English": [
- "e",
- "a",
- "t",
- "i",
- "o",
- "n",
- "s",
- "r",
- "h",
- "l",
- "d",
- "c",
- "m",
- "u",
- "f",
- "p",
- "g",
- "w",
- "b",
- "y",
- "v",
- "k",
- "j",
- "x",
- "z",
- "q",
- ],
"Thai": [
"า",
"น",
@@ -1066,31 +1409,6 @@
"ஒ",
"ஸ",
],
- "Classical Chinese": [
- "之",
- "年",
- "為",
- "也",
- "以",
- "一",
- "人",
- "其",
- "者",
- "國",
- "有",
- "二",
- "十",
- "於",
- "曰",
- "三",
- "不",
- "大",
- "而",
- "子",
- "中",
- "五",
- "四",
- ],
"Kazakh": [
"а",
"ы",
diff --git a/charset_normalizer/cd.py b/charset_normalizer/cd.py
index ee4b7424..ae2813fb 100644
--- a/charset_normalizer/cd.py
+++ b/charset_normalizer/cd.py
@@ -105,7 +105,7 @@ def mb_encoding_languages(iana_name: str) -> List[str]:
):
return ["Japanese"]
if iana_name.startswith("gb") or iana_name in ZH_NAMES:
- return ["Chinese", "Classical Chinese"]
+ return ["Chinese"]
if iana_name.startswith("iso2022_kr") or iana_name in KO_NAMES:
return ["Korean"]
@@ -179,22 +179,45 @@ def characters_popularity_compare(
character_approved_count: int = 0
FREQUENCIES_language_set = set(FREQUENCIES[language])
- for character in ordered_characters:
+ ordered_characters_count: int = len(ordered_characters)
+ target_language_characters_count: int = len(FREQUENCIES[language])
+
+ large_alphabet: bool = target_language_characters_count > 26
+
+ for character, character_rank in zip(
+ ordered_characters, range(0, ordered_characters_count)
+ ):
if character not in FREQUENCIES_language_set:
continue
+ character_rank_in_language: int = FREQUENCIES[language].index(character)
+ expected_projection_ratio: float = (
+ target_language_characters_count / ordered_characters_count
+ )
+ character_rank_projection: int = int(character_rank * expected_projection_ratio)
+
+ if (
+ large_alphabet is False
+ and abs(character_rank_projection - character_rank_in_language) > 4
+ ):
+ continue
+
+ if (
+ large_alphabet is True
+ and abs(character_rank_projection - character_rank_in_language)
+ < target_language_characters_count / 3
+ ):
+ character_approved_count += 1
+ continue
+
characters_before_source: List[str] = FREQUENCIES[language][
- 0 : FREQUENCIES[language].index(character)
+ 0:character_rank_in_language
]
characters_after_source: List[str] = FREQUENCIES[language][
- FREQUENCIES[language].index(character) :
- ]
- characters_before: List[str] = ordered_characters[
- 0 : ordered_characters.index(character)
- ]
- characters_after: List[str] = ordered_characters[
- ordered_characters.index(character) :
+ character_rank_in_language:
]
+ characters_before: List[str] = ordered_characters[0:character_rank]
+ characters_after: List[str] = ordered_characters[character_rank:]
before_match_count: int = len(
set(characters_before) & set(characters_before_source)
@@ -289,6 +312,33 @@ def merge_coherence_ratios(results: List[CoherenceMatches]) -> CoherenceMatches:
return sorted(merge, key=lambda x: x[1], reverse=True)
+def filter_alt_coherence_matches(results: CoherenceMatches) -> CoherenceMatches:
+ """
+ We shall NOT return "English—" in CoherenceMatches because it is an alternative
+ of "English". This function only keeps the best match and remove the em-dash in it.
+ """
+ index_results: Dict[str, List[float]] = dict()
+
+ for result in results:
+ language, ratio = result
+ no_em_name: str = language.replace("—", "")
+
+ if no_em_name not in index_results:
+ index_results[no_em_name] = []
+
+ index_results[no_em_name].append(ratio)
+
+ if any(len(index_results[e]) > 1 for e in index_results):
+ filtered_results: CoherenceMatches = []
+
+ for language in index_results:
+ filtered_results.append((language, max(index_results[language])))
+
+ return filtered_results
+
+ return results
+
+
@lru_cache(maxsize=2048)
def coherence_ratio(
decoded_sequence: str, threshold: float = 0.1, lg_inclusion: Optional[str] = None
@@ -336,4 +386,6 @@ def coherence_ratio(
if sufficient_match_count >= 3:
break
- return sorted(results, key=lambda x: x[1], reverse=True)
+ return sorted(
+ filter_alt_coherence_matches(results), key=lambda x: x[1], reverse=True
+ )
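For instance, a quick check of the new filter (the ratios are arbitrary):

```python
from charset_normalizer.cd import filter_alt_coherence_matches

# "English—" is an internal alternative frequency set for "English";
# only the best ratio survives and the em dash variant is collapsed.
matches = [("English", 0.88), ("English—", 0.99), ("Italian", 0.77)]
print(filter_alt_coherence_matches(matches))
# -> [('English', 0.99), ('Italian', 0.77)]
```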
diff --git a/charset_normalizer/cli/normalizer.py b/charset_normalizer/cli/normalizer.py
index b8b652a5..ad26b4d0 100644
--- a/charset_normalizer/cli/normalizer.py
+++ b/charset_normalizer/cli/normalizer.py
@@ -1,15 +1,12 @@
import argparse
import sys
from json import dumps
-from os.path import abspath
+from os.path import abspath, basename, dirname, join, realpath
from platform import python_version
from typing import List, Optional
+from unicodedata import unidata_version
-try:
- from unicodedata2 import unidata_version
-except ImportError:
- from unicodedata import unidata_version
-
+import charset_normalizer.md as md_module
from charset_normalizer import from_fp
from charset_normalizer.models import CliDetectionResult
from charset_normalizer.version import __version__
@@ -124,8 +121,11 @@ def cli_detect(argv: Optional[List[str]] = None) -> int:
parser.add_argument(
"--version",
action="version",
- version="Charset-Normalizer {} - Python {} - Unicode {}".format(
- __version__, python_version(), unidata_version
+ version="Charset-Normalizer {} - Python {} - Unicode {} - SpeedUp {}".format(
+ __version__,
+ python_version(),
+ unidata_version,
+ "OFF" if md_module.__file__.lower().endswith(".py") else "ON",
),
help="Show version information and exit.",
)
@@ -234,7 +234,10 @@ def cli_detect(argv: Optional[List[str]] = None) -> int:
my_file.close()
continue
- o_: List[str] = my_file.name.split(".")
+ dir_path = dirname(realpath(my_file.name))
+ file_name = basename(realpath(my_file.name))
+
+ o_: List[str] = file_name.split(".")
if args.replace is False:
o_.insert(-1, best_guess.encoding)
@@ -255,7 +258,7 @@ def cli_detect(argv: Optional[List[str]] = None) -> int:
continue
try:
- x_[0].unicode_path = abspath("./{}".format(".".join(o_)))
+ x_[0].unicode_path = join(dir_path, ".".join(o_))
with open(x_[0].unicode_path, "w", encoding="utf-8") as fp:
fp.write(str(best_guess))
diff --git a/charset_normalizer/constant.py b/charset_normalizer/constant.py
index e679f79c..3188108d 100644
--- a/charset_normalizer/constant.py
+++ b/charset_normalizer/constant.py
@@ -489,8 +489,6 @@
KO_NAMES: Set[str] = {"johab", "cp949", "euc_kr"}
ZH_NAMES: Set[str] = {"big5", "cp950", "big5hkscs", "hz"}
-NOT_PRINTABLE_PATTERN = re_compile(r"[0-9\W\n\r\t]+")
-
LANGUAGE_SUPPORTED_COUNT: int = len(FREQUENCIES)
# Logging LEVEL below DEBUG
diff --git a/charset_normalizer/legacy.py b/charset_normalizer/legacy.py
index cdebe2b8..b266d176 100644
--- a/charset_normalizer/legacy.py
+++ b/charset_normalizer/legacy.py
@@ -1,9 +1,7 @@
-import warnings
from typing import Dict, Optional, Union
-from .api import from_bytes, from_fp, from_path, normalize
+from .api import from_bytes
from .constant import CHARDET_CORRESPONDENCE
-from .models import CharsetMatch, CharsetMatches
def detect(byte_str: bytes) -> Dict[str, Optional[Union[str, float]]]:
@@ -43,53 +41,3 @@ def detect(byte_str: bytes) -> Dict[str, Optional[Union[str, float]]]:
"language": language,
"confidence": confidence,
}
-
-
-class CharsetNormalizerMatch(CharsetMatch):
- pass
-
-
-class CharsetNormalizerMatches(CharsetMatches):
- @staticmethod
- def from_fp(*args, **kwargs): # type: ignore
- warnings.warn( # pragma: nocover
- "staticmethod from_fp, from_bytes, from_path and normalize are deprecated "
- "and scheduled to be removed in 3.0",
- DeprecationWarning,
- )
- return from_fp(*args, **kwargs) # pragma: nocover
-
- @staticmethod
- def from_bytes(*args, **kwargs): # type: ignore
- warnings.warn( # pragma: nocover
- "staticmethod from_fp, from_bytes, from_path and normalize are deprecated "
- "and scheduled to be removed in 3.0",
- DeprecationWarning,
- )
- return from_bytes(*args, **kwargs) # pragma: nocover
-
- @staticmethod
- def from_path(*args, **kwargs): # type: ignore
- warnings.warn( # pragma: nocover
- "staticmethod from_fp, from_bytes, from_path and normalize are deprecated "
- "and scheduled to be removed in 3.0",
- DeprecationWarning,
- )
- return from_path(*args, **kwargs) # pragma: nocover
-
- @staticmethod
- def normalize(*args, **kwargs): # type: ignore
- warnings.warn( # pragma: nocover
- "staticmethod from_fp, from_bytes, from_path and normalize are deprecated "
- "and scheduled to be removed in 3.0",
- DeprecationWarning,
- )
- return normalize(*args, **kwargs) # pragma: nocover
-
-
-class CharsetDetector(CharsetNormalizerMatches):
- pass
-
-
-class CharsetDoctor(CharsetNormalizerMatches):
- pass
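The chardet-compatible `detect()` entry point is all that remains here; a minimal sketch (the exact confidence and language depend on the detection run):

```python
from charset_normalizer import detect

# Drop-in replacement for chardet.detect(); returns the same dict shape.
result = detect("Comment ça va aujourd'hui ?".encode("utf_8"))
print(result)  # e.g. {'encoding': 'utf-8', 'language': 'French', 'confidence': 1.0}
```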
diff --git a/charset_normalizer/md.py b/charset_normalizer/md.py
index 31808af8..56e9321a 100644
--- a/charset_normalizer/md.py
+++ b/charset_normalizer/md.py
@@ -1,7 +1,12 @@
from functools import lru_cache
+from logging import getLogger
from typing import List, Optional
-from .constant import COMMON_SAFE_ASCII_CHARACTERS, UNICODE_SECONDARY_RANGE_KEYWORD
+from .constant import (
+ COMMON_SAFE_ASCII_CHARACTERS,
+ TRACE,
+ UNICODE_SECONDARY_RANGE_KEYWORD,
+)
from .utils import (
is_accentuated,
is_ascii,
@@ -123,7 +128,7 @@ def reset(self) -> None: # pragma: no cover
@property
def ratio(self) -> float:
- if self._character_count == 0:
+ if self._character_count == 0 or self._character_count < 8:
return 0.0
ratio_of_accentuation: float = self._accentuated_count / self._character_count
return ratio_of_accentuation if ratio_of_accentuation >= 0.35 else 0.0
@@ -547,7 +552,20 @@ def mess_ratio(
break
if debug:
+ logger = getLogger("charset_normalizer")
+
+ logger.log(
+ TRACE,
+ "Mess-detector extended-analysis start. "
+ f"intermediary_mean_mess_ratio_calc={intermediary_mean_mess_ratio_calc} mean_mess_ratio={mean_mess_ratio} "
+ f"maximum_threshold={maximum_threshold}",
+ )
+
+ if len(decoded_sequence) > 16:
+ logger.log(TRACE, f"Starting with: {decoded_sequence[:16]}")
+ logger.log(TRACE, f"Ending with: {decoded_sequence[-16::]}")
+
for dt in detectors: # pragma: nocover
- print(dt.__class__, dt.ratio)
+ logger.log(TRACE, f"{dt.__class__}: {dt.ratio}")
return round(mean_mess_ratio, 3)
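A hedged sketch of how those TRACE records surface: per the change above they are only emitted when `explain=True` and `cp_isolation` holds one or two encodings (the payload is made up):

```python
from charset_normalizer import from_bytes

# explain=True attaches a console handler at TRACE level; combined with a
# narrow cp_isolation it now also logs each mess-detector plugin ratio.
from_bytes(
    "Un texte accentué, déjà encodé différemment.".encode("cp1252"),
    cp_isolation=["cp1252", "latin_1"],
    explain=True,
)
```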
diff --git a/charset_normalizer/models.py b/charset_normalizer/models.py
index bc16bfb6..7f8ca389 100644
--- a/charset_normalizer/models.py
+++ b/charset_normalizer/models.py
@@ -1,22 +1,9 @@
-import warnings
-from collections import Counter
from encodings.aliases import aliases
from hashlib import sha256
from json import dumps
-from re import sub
-from typing import (
- Any,
- Counter as TypeCounter,
- Dict,
- Iterator,
- List,
- Optional,
- Tuple,
- Union,
-)
-
-from .constant import NOT_PRINTABLE_PATTERN, TOO_BIG_SEQUENCE
-from .md import mess_ratio
+from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
+
+from .constant import TOO_BIG_SEQUENCE
from .utils import iana_name, is_multi_byte_encoding, unicode_range
@@ -78,45 +65,6 @@ def __lt__(self, other: object) -> bool:
def multi_byte_usage(self) -> float:
return 1.0 - len(str(self)) / len(self.raw)
- @property
- def chaos_secondary_pass(self) -> float:
- """
- Check once again chaos in decoded text, except this time, with full content.
- Use with caution, this can be very slow.
- Notice: Will be removed in 3.0
- """
- warnings.warn(
- "chaos_secondary_pass is deprecated and will be removed in 3.0",
- DeprecationWarning,
- )
- return mess_ratio(str(self), 1.0)
-
- @property
- def coherence_non_latin(self) -> float:
- """
- Coherence ratio on the first non-latin language detected if ANY.
- Notice: Will be removed in 3.0
- """
- warnings.warn(
- "coherence_non_latin is deprecated and will be removed in 3.0",
- DeprecationWarning,
- )
- return 0.0
-
- @property
- def w_counter(self) -> TypeCounter[str]:
- """
- Word counter instance on decoded text.
- Notice: Will be removed in 3.0
- """
- warnings.warn(
- "w_counter is deprecated and will be removed in 3.0", DeprecationWarning
- )
-
- string_printable_only = sub(NOT_PRINTABLE_PATTERN, " ", str(self).lower())
-
- return Counter(string_printable_only.split())
-
def __str__(self) -> str:
# Lazy Str Loading
if self._string is None:
@@ -252,18 +200,6 @@ def could_be_from_charset(self) -> List[str]:
"""
return [self._encoding] + [m.encoding for m in self._leaves]
- def first(self) -> "CharsetMatch":
- """
- Kept for BC reasons. Will be removed in 3.0.
- """
- return self
-
- def best(self) -> "CharsetMatch":
- """
- Kept for BC reasons. Will be removed in 3.0.
- """
- return self
-
def output(self, encoding: str = "utf_8") -> bytes:
"""
Method to get re-encoded bytes payload using given target encoding. Default to UTF-8.
diff --git a/charset_normalizer/utils.py b/charset_normalizer/utils.py
index 859f212b..425d8365 100644
--- a/charset_normalizer/utils.py
+++ b/charset_normalizer/utils.py
@@ -1,12 +1,6 @@
-try:
- # WARNING: unicodedata2 support is going to be removed in 3.0
- # Python is quickly catching up.
- import unicodedata2 as unicodedata
-except ImportError:
- import unicodedata # type: ignore[no-redef]
-
import importlib
import logging
+import unicodedata
from codecs import IncrementalDecoder
from encodings.aliases import aliases
from functools import lru_cache
diff --git a/charset_normalizer/version.py b/charset_normalizer/version.py
index 64c0dbde..25bf3bcf 100644
--- a/charset_normalizer/version.py
+++ b/charset_normalizer/version.py
@@ -2,5 +2,5 @@
Expose version
"""
-__version__ = "2.1.1"
+__version__ = "3.0.0rc1"
VERSION = __version__.split(".")
diff --git a/dev-requirements.txt b/dev-requirements.txt
index 8e77fe94..91e06b88 100644
--- a/dev-requirements.txt
+++ b/dev-requirements.txt
@@ -1,10 +1,24 @@
-pytest
-pytest-cov
-codecov
-chardet>=5.0,<5.1
-Flask>=2.0,<3.0
-requests>=2.26,<3.0
-black==22.8.0
-flake8==5.0.4
-mypy==0.971
-isort
+flake8==5.0.4
+chardet==5.0.0
+isort==5.10.1
+codecov==2.1.12
+pytest-cov==4.0.0
+build==0.8.0
+wheel==0.37.1
+
+# The vast majority of projects dropped Python 3.6
+# This is to ensure builds are reproducible on >=3.6
+black==22.8.0; python_version < "3.7"
+black==22.10.0; python_version >= "3.7"
+
+mypy==0.982; python_version >= "3.7"
+mypy==0.971; python_version < "3.7"
+
+Flask==2.2.2; python_version >= "3.7"
+Flask==2.0.3; python_version < "3.7"
+
+pytest==7.0.0; python_version < "3.7"
+pytest==7.1.3; python_version >= "3.7"
+
+requests==2.27.1; python_version < "3.7"
+requests==2.28.1; python_version >= "3.7"
diff --git a/docs/api.rst b/docs/api.rst
index 47a985e5..48b74951 100644
--- a/docs/api.rst
+++ b/docs/api.rst
@@ -14,11 +14,9 @@ Those functions are publicly exposed and are protected through our BC guarantee.
.. autofunction:: from_fp
.. autofunction:: from_path
-.. autofunction:: normalize
-
-.. autoclass:: charset_normalizer.CharsetMatches
+.. autoclass:: charset_normalizer.models.CharsetMatches
:inherited-members:
-.. autoclass:: charset_normalizer.CharsetMatch
+.. autoclass:: charset_normalizer.models.CharsetMatch
:inherited-members:
.. autofunction:: detect
@@ -99,3 +97,8 @@ Some reusable functions used across the project. We do not guarantee the BC in t
.. autofunction:: charset_normalizer.utils.range_scan
.. autofunction:: charset_normalizer.utils.is_cp_similar
+
+
+.. class:: os.PathLike
+
+ Used as a generic way to accept AnyStr for paths.
diff --git a/docs/community/speedup.rst b/docs/community/speedup.rst
new file mode 100644
index 00000000..ea45b297
--- /dev/null
+++ b/docs/community/speedup.rst
@@ -0,0 +1,43 @@
+Optional speedup extension
+===========================
+
+Why?
+-------
+
+charset-normalizer will always remain pure Python, meaning that an environment without any build capabilities will
+run this program without any additional requirements.
+
+Nonetheless, starting with version 3.0 we introduce and publish some platform-specific wheels including a
+pre-built extension.
+
+Most of the time is spent in the module `md.py` so we decided to "compile it" using Mypyc.
+
+(1) It does not require a separate code base
+(2) Our project code base is rather simple and lightweight
+(3) Mypyc is robust enough today
+(4) Four times faster!
+
+How?
+-------
+
+If your platform and/or architecture is not served by this swift optimization, you may easily compile it yourself.
+Follow these instructions (provided you have the necessary toolchain installed):
+
+ ::
+
+ export CHARSET_NORMALIZER_USE_MYPYC=1
+ pip install mypy build wheel
+ pip install charset-normalizer --no-binary :all:
+
+
+How not to?
+-----------
+
+You may install charset-normalizer without any specific pre-built wheel by directly using the universal wheel
+(most likely hosted on PyPI or any valid mirror you use):
+
+ ::
+
+ pip install charset-normalizer --no-binary :all:
+
+This installs the pure Python implementation directly.
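To check from Python whether the compiled extension is active, a small sketch mirroring what `normalizer --version` now reports:

```python
import charset_normalizer.md as md_module

# The mypyc build replaces md.py with a binary extension (.so/.pyd), so the
# module path no longer ends with ".py" when the speedup is in effect.
speedup_on = not md_module.__file__.lower().endswith(".py")
print("SpeedUp", "ON" if speedup_on else "OFF")
```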
diff --git a/docs/community/why_migrate.rst b/docs/community/why_migrate.rst
index 717fc3b5..1909c770 100644
--- a/docs/community/why_migrate.rst
+++ b/docs/community/why_migrate.rst
@@ -4,13 +4,13 @@ Why should I migrate to Charset-Normalizer?
There are so many reasons to migrate your current project. Here are some of them:
- Remove ANY license ambiguity/restriction for projects bundling Chardet (even indirectly).
-- X5 faster than Chardet in average and X3 faster in 99% of the cases AND support 3 times more encoding.
+- X10 faster than Chardet on average and X6 faster in 99% of the cases AND supports 3 times more encodings.
- Never return a encoding if not suited for the given decoder. Eg. Never get UnicodeDecodeError!
- Actively maintained, open to contributors.
- Have the backward compatible function ``detect`` that come from Chardet.
- Truly detect the language used in the text.
- It is, for the first time, really universal! As there is no specific probe per charset.
-- The package size is X4 lower than Chardet's (5.0)!
+- The package size is X2~X4 lower than Chardet's (5.0)! (depending on your arch)
- Propose much more options/public kwargs to tweak the detection as you sees fit!
- Using static typing to ease your development.
- Detect Unicode content better than Chardet or cChardet does.
diff --git a/docs/conf.py b/docs/conf.py
index 5cfe028b..3e675d42 100755
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -81,7 +81,7 @@
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
-language = None
+language = "en"
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
@@ -113,7 +113,7 @@
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ['_static']
+html_static_path = []
# -- Options for HTMLHelp output ------------------------------------------
diff --git a/docs/index.rst b/docs/index.rst
index 2398a7f0..05d5f98a 100755
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -13,7 +13,6 @@ It aims to be as generic as possible.
.. image:: https://repository-images.githubusercontent.com/200259335/d3da9600-dedc-11e9-83e8-081f597505df
:width: 500px
- :scale: 100 %
:alt: CLI Charset Normalizer
:align: right
@@ -72,6 +71,7 @@ Community Guide
.. toctree::
:maxdepth: 2
+ community/speedup
community/faq
community/why_migrate
diff --git a/docs/user/advanced_search.rst b/docs/user/advanced_search.rst
index b4441e58..a269cd10 100644
--- a/docs/user/advanced_search.rst
+++ b/docs/user/advanced_search.rst
@@ -18,7 +18,8 @@ As follow ::
cp_isolation=None, # Finite list of encoding to use when searching for a match
cp_exclusion=None, # Finite list of encoding to avoid when searching for a match
preemptive_behaviour=True, # Determine if we should look into my_byte_str (ASCII-Mode) for pre-defined encoding
- explain=False # Print on screen what is happening when searching for a match
+ explain=False, # Print on screen what is happening when searching for a match
+ language_threshold=0.1 # Minimum coherence ratio / language ratio match accepted
)
diff --git a/docs/user/support.rst b/docs/user/support.rst
index 8b624933..0dbf06b9 100644
--- a/docs/user/support.rst
+++ b/docs/user/support.rst
@@ -92,13 +92,10 @@ mac_iceland maciceland
mac_latin2 maccentraleurope, maclatin2
mac_roman macintosh, macroman
mac_turkish macturkish
-mbcs ansi, dbcs
ptcp154 csptcp154, pt154, cp154, cyrillic_asian
-rot_13 rot13
shift_jis csshiftjis, shiftjis, sjis, s_jis, x_mac_japanese
shift_jis_2004 shiftjis2004, sjis_2004, s_jis_2004
shift_jisx0213 shiftjisx0213, sjisx0213, s_jisx0213
-tactis tis260
tis_620 tis620, tis_620_0, tis_620_2529_0, tis_620_2529_1, iso_ir_166
utf_16 u16, utf16
utf_16_be unicodebigunmarked, utf_16be
@@ -107,9 +104,11 @@ utf_32 u32, utf32
utf_32_be utf_32be
utf_32_le utf_32le
utf_8 u8, utf, utf8, utf8_ucs2, utf8_ucs4 (+utf_8_sig)
-utf_7 u7, unicode-1-1-utf-7
+utf_7* u7, unicode-1-1-utf-7
=============== ===============================================================================================================================
+*: Only if a SIG/mark is found.
+
-------------------
Supported Languages
-------------------
diff --git a/setup.cfg b/setup.cfg
index bb4f9c50..8000f5cd 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,3 +1,59 @@
+[metadata]
+name = charset-normalizer
+description = The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet.
+long_description = file: README.md, CHANGELOG.md, LICENSE
+long_description_content_type = text/markdown
+keywords = encoding, charset, charset-detector, detector, normalization, unicode, chardet, detect
+url = https://github.com/Ousret/charset_normalizer
+license = MIT
+author_email = ahmed.tahri@cloudnursery.dev
+author = Ahmed TAHRI
+python_requires = >=3.6.0
+project_urls =
+ Bug Reports = https://github.com/Ousret/charset_normalizer/issues
+ Documentation = https://charset-normalizer.readthedocs.io/en/latest
+classifiers =
+ Development Status :: 5 - Production/Stable
+ License :: OSI Approved :: MIT License
+ Intended Audience :: Developers
+ Topic :: Software Development :: Libraries :: Python Modules
+ Operating System :: OS Independent
+ Programming Language :: Python
+ Programming Language :: Python :: 3
+ Programming Language :: Python :: 3.6
+ Programming Language :: Python :: 3.7
+ Programming Language :: Python :: 3.8
+ Programming Language :: Python :: 3.9
+ Programming Language :: Python :: 3.10
+ Programming Language :: Python :: 3.11
+ Programming Language :: Python :: Implementation :: PyPy
+ Topic :: Text Processing :: Linguistic
+ Topic :: Utilities
+ Typing :: Typed
+
+[options.packages.find]
+exclude =
+ tests
+ *.tests
+ *.tests.*
+ tests.*
+ docs*
+ data*
+
+[options.extras_require]
+unicode_backport =
+
+[options.entry_points]
+console_scripts =
+ normalizer = charset_normalizer.cli.normalizer:cli_detect
+
+[options]
+packages = find:
+include_package_data = True
+
+[options.package_data]
+charset_normalizer = py.typed
+
[tool:pytest]
addopts = --cov=charset_normalizer --cov-report=term-missing -rxXs
@@ -11,4 +67,4 @@ ignore_missing_imports = True
[tool:isort]
profile = black
-combine_as_imports = True
\ No newline at end of file
+combine_as_imports = True
diff --git a/setup.py b/setup.py
index 298d12be..7c64a695 100644
--- a/setup.py
+++ b/setup.py
@@ -1,11 +1,11 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
-import io
import os
+import sys
from re import search
-from setuptools import find_packages, setup
+from setuptools import setup
def get_version():
@@ -14,73 +14,25 @@ def get_version():
version_file.read()).group('version')
-# Package meta-data.
-NAME = 'charset-normalizer'
-DESCRIPTION = 'The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet.'
-URL = 'https://github.com/ousret/charset_normalizer'
-EMAIL = 'ahmed.tahri@cloudnursery.dev'
-AUTHOR = 'Ahmed TAHRI @Ousret'
-REQUIRES_PYTHON = '>=3.6.0'
-VERSION = get_version()
+USE_MYPYC = False
-REQUIRED = []
+if len(sys.argv) > 1 and sys.argv[1] == "--use-mypyc":
+ sys.argv.pop(1)
+ USE_MYPYC = True
+if os.getenv("CHARSET_NORMALIZER_USE_MYPYC", None) == "1":
+ USE_MYPYC = True
-EXTRAS = {
- 'unicode_backport': ['unicodedata2']
-}
+if USE_MYPYC:
+ from mypyc.build import mypycify
-here = os.path.abspath(os.path.dirname(__file__))
-
-try:
- with io.open(os.path.join(here, 'README.md'), encoding='utf-8') as f:
- long_description = '\n' + f.read()
-except FileNotFoundError:
- long_description = DESCRIPTION
+ MYPYC_MODULES = mypycify([
+ "charset_normalizer/md.py"
+ ])
+else:
+ MYPYC_MODULES = None
setup(
- name=NAME,
- version=VERSION,
- description=DESCRIPTION,
- long_description=long_description.replace(':heavy_check_mark:', '✅'),
- long_description_content_type='text/markdown',
- author=AUTHOR,
- author_email=EMAIL,
- python_requires=REQUIRES_PYTHON,
- url=URL,
- keywords=['encoding', 'i18n', 'txt', 'text', 'charset', 'charset-detector', 'normalization', 'unicode', 'chardet'],
- packages=find_packages(exclude=["tests", "*.tests", "*.tests.*", "tests.*"]),
- install_requires=REQUIRED,
- extras_require=EXTRAS,
- include_package_data=True,
- package_data={"charset_normalizer": ["py.typed"]},
- license='MIT',
- entry_points={
- 'console_scripts':
- [
- 'normalizer = charset_normalizer.cli.normalizer:cli_detect'
- ]
- },
- classifiers=[
- 'Development Status :: 5 - Production/Stable',
- 'License :: OSI Approved :: MIT License',
- 'Intended Audience :: Developers',
- 'Topic :: Software Development :: Libraries :: Python Modules',
- 'Operating System :: OS Independent',
- 'Programming Language :: Python',
- 'Programming Language :: Python :: 3',
- 'Programming Language :: Python :: 3.6',
- 'Programming Language :: Python :: 3.7',
- 'Programming Language :: Python :: 3.8',
- 'Programming Language :: Python :: 3.9',
- 'Programming Language :: Python :: 3.10',
- 'Programming Language :: Python :: 3.11',
- 'Topic :: Text Processing :: Linguistic',
- 'Topic :: Utilities',
- 'Programming Language :: Python :: Implementation :: PyPy',
- 'Typing :: Typed'
- ],
- project_urls={
- 'Bug Reports': 'https://github.com/Ousret/charset_normalizer/issues',
- 'Documentation': 'https://charset-normalizer.readthedocs.io/en/latest',
- },
+ name="charset-normalizer",
+ version=get_version(),
+ ext_modules=MYPYC_MODULES
)
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 16601750..d42bf46b 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -2,7 +2,12 @@
from charset_normalizer.cli.normalizer import cli_detect, query_yes_no
from unittest.mock import patch
from os.path import exists
-from os import remove
+from os import remove, path, pardir
+
+DIR_PATH = path.join(
+ path.dirname(path.realpath(__file__)),
+ pardir
+)
class TestCommandLineInterface(unittest.TestCase):
@@ -24,24 +29,33 @@ def test_single_file(self):
self.assertEqual(
0,
cli_detect(
- ['./data/sample-arabic-1.txt']
+ [DIR_PATH + '/data/sample-arabic-1.txt']
)
)
+ def test_version_output_success(self):
+ with self.assertRaises(SystemExit):
+ cli_detect(
+ ['--version']
+ )
+
def test_single_file_normalize(self):
self.assertEqual(
0,
cli_detect(
- ['./data/sample-arabic-1.txt', '--normalize']
+ [
+ DIR_PATH + '/data/sample-arabic-1.txt',
+ '--normalize'
+ ]
)
)
self.assertTrue(
- exists('./data/sample-arabic-1.cp1256.txt')
+ exists(DIR_PATH + '/data/sample-arabic-1.cp1256.txt')
)
try:
- remove('./data/sample-arabic-1.cp1256.txt')
+ remove(DIR_PATH + '/data/sample-arabic-1.cp1256.txt')
except:
pass
@@ -49,7 +63,7 @@ def test_single_verbose_file(self):
self.assertEqual(
0,
cli_detect(
- ['./data/sample-arabic-1.txt', '--verbose']
+ [DIR_PATH + '/data/sample-arabic-1.txt', '--verbose']
)
)
@@ -58,9 +72,9 @@ def test_multiple_file(self):
0,
cli_detect(
[
- './data/sample-arabic-1.txt',
- './data/sample-french.txt',
- './data/sample-chinese.txt'
+ DIR_PATH + '/data/sample-arabic-1.txt',
+ DIR_PATH + '/data/sample-french.txt',
+ DIR_PATH + '/data/sample-chinese.txt'
]
)
)
@@ -71,9 +85,9 @@ def test_with_alternative(self):
cli_detect(
[
'-a',
- './data/sample-arabic-1.txt',
- './data/sample-french.txt',
- './data/sample-chinese.txt'
+ DIR_PATH + '/data/sample-arabic-1.txt',
+ DIR_PATH + '/data/sample-french.txt',
+ DIR_PATH + '/data/sample-chinese.txt'
]
)
)
@@ -84,9 +98,9 @@ def test_with_minimal_output(self):
cli_detect(
[
'-m',
- './data/sample-arabic-1.txt',
- './data/sample-french.txt',
- './data/sample-chinese.txt'
+ DIR_PATH + '/data/sample-arabic-1.txt',
+ DIR_PATH + '/data/sample-french.txt',
+ DIR_PATH + '/data/sample-chinese.txt'
]
)
)
@@ -98,9 +112,9 @@ def test_with_minimal_and_alt(self):
[
'-m',
'-a',
- './data/sample-arabic-1.txt',
- './data/sample-french.txt',
- './data/sample-chinese.txt'
+ DIR_PATH + '/data/sample-arabic-1.txt',
+ DIR_PATH + '/data/sample-french.txt',
+ DIR_PATH + '/data/sample-chinese.txt'
]
)
)
@@ -109,7 +123,7 @@ def test_non_existent_file(self):
with self.assertRaises(SystemExit) as cm:
cli_detect(
- ['./data/not_found_data.txt']
+ [DIR_PATH + '/data/not_found_data.txt']
)
self.assertEqual(cm.exception.code, 2)
@@ -119,7 +133,7 @@ def test_replace_without_normalize(self):
self.assertEqual(
cli_detect(
[
- './data/sample-arabic-1.txt',
+ DIR_PATH + '/data/sample-arabic-1.txt',
'--replace'
]
),
@@ -130,7 +144,7 @@ def test_force_replace_without_replace(self):
self.assertEqual(
cli_detect(
[
- './data/sample-arabic-1.txt',
+ DIR_PATH + '/data/sample-arabic-1.txt',
'--force'
]
),
diff --git a/tests/test_coherence_detection.py b/tests/test_coherence_detection.py
index 6ad95927..7e399132 100644
--- a/tests/test_coherence_detection.py
+++ b/tests/test_coherence_detection.py
@@ -1,5 +1,5 @@
import pytest
-from charset_normalizer.cd import encoding_languages, mb_encoding_languages, is_multi_byte_encoding, get_target_features
+from charset_normalizer.cd import encoding_languages, mb_encoding_languages, is_multi_byte_encoding, get_target_features, filter_alt_coherence_matches
@pytest.mark.parametrize(
@@ -39,3 +39,18 @@ def test_target_features(language, expected_have_accents, expected_pure_latin):
assert target_have_accents is expected_have_accents
assert target_pure_latin is expected_pure_latin
+
+
+@pytest.mark.parametrize(
+ "matches, expected_return",
+ [
+ ([("English", 0.88,), ("English—", 0.99)], [("English", 0.99)]),
+ ([("English", 0.88,), ("English—", 0.99), ("English——", 0.999)], [("English", 0.999)]),
+ ([("English", 0.88,), ("English—", 0.77)], [("English", 0.88)]),
+ ([("English", 0.88,), ("Italian", 0.77)], [("English", 0.88), ("Italian", 0.77)]),
+ ]
+)
+def test_filter_alt_coherence_matches(matches, expected_return):
+ results = filter_alt_coherence_matches(matches)
+
+ assert results == expected_return
diff --git a/tests/test_full_detection.py b/tests/test_full_detection.py
index 96e0b797..adff8801 100644
--- a/tests/test_full_detection.py
+++ b/tests/test_full_detection.py
@@ -1,5 +1,11 @@
from charset_normalizer.api import from_path
import pytest
+from os import path, pardir
+
+DIR_PATH = path.join(
+ path.dirname(path.realpath(__file__)),
+ pardir
+)
@pytest.mark.parametrize(
@@ -10,7 +16,7 @@
('sample-arabic.txt', 'utf_8', 'Arabic'),
('sample-russian-3.txt', 'utf_8', 'Russian'),
('sample-french.txt', 'utf_8', 'French'),
- ('sample-chinese.txt', 'big5', 'Classical Chinese'),
+ ('sample-chinese.txt', 'big5', 'Chinese'),
('sample-greek.txt', 'cp1253', 'Greek'),
('sample-greek-2.txt', 'cp1253', 'Greek'),
('sample-hebrew-2.txt', 'cp1255', 'Hebrew'),
@@ -30,7 +36,7 @@ def test_elementary_detection(
expected_charset: str,
expected_language: str,
):
- best_guess = from_path("./data/{}".format(input_data_file)).best()
+ best_guess = from_path(DIR_PATH + "/data/{}".format(input_data_file)).best()
assert best_guess is not None, "Elementary detection has failed upon '{}'".format(input_data_file)
assert best_guess.encoding == expected_charset, "Elementary charset detection has failed upon '{}'".format(input_data_file)
diff --git a/tests/test_normalize_fp.py b/tests/test_normalize_fp.py
deleted file mode 100644
index e2ce364a..00000000
--- a/tests/test_normalize_fp.py
+++ /dev/null
@@ -1,20 +0,0 @@
-import pytest
-from charset_normalizer import normalize
-from os.path import exists
-from os import unlink
-
-
-def test_normalize_fp_creation():
- guesses = normalize(
- "./data/sample-arabic-1.txt"
- )
-
- predicted_path = "./data/sample-arabic-1-{}.txt".format(guesses.best().encoding)
- path_exist = exists(
- "./data/sample-arabic-1-{}.txt".format(guesses.best().encoding)
- )
-
- assert path_exist is True
-
- if path_exist:
- unlink(predicted_path)