From e2aa46d1f600e6e633e7112389386df502424405 Mon Sep 17 00:00:00 2001 From: Mehdi Samsami Date: Wed, 25 Dec 2024 12:48:25 +0330 Subject: [PATCH 1/9] add support for python 3.13, numpy 2, and newer versions of sklearn and pandas, upgrade dev dependencies --- pyproject.toml | 9 ++++----- requirements.txt | 1 - requirements_dev.txt | 4 ++-- setup.py | 10 +++++----- 4 files changed, 11 insertions(+), 13 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index f7639bc..18c84bd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,7 +17,6 @@ keywords = [ "bayes", "naive bayes", "classifier", - "probabilistic", ] classifiers = [ "Intended Audience :: Science/Research", @@ -31,12 +30,12 @@ classifiers = [ "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", "License :: OSI Approved :: BSD License", ] -requires-python = ">=3.8,<3.13" +requires-python = ">=3.8,<3.14" dependencies = [ "pandas>=1.4.1", - "numpy<2.0.0", "scipy>=1.8.0", "scikit-learn>=1.0.2", "typing-extensions>=4.8.0; python_full_version < '3.11'", @@ -49,9 +48,9 @@ Source = "https://github.com/msamsami/wnb" [project.optional-dependencies] dev = [ "pytest>=7.0.0", - "black==24.8.0", + "black>=24.10.0", "tqdm", - "pre-commit", + "pre-commit>=4.0.0", "isort", ] diff --git a/requirements.txt b/requirements.txt index ea50ff2..e6652c5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,4 @@ pandas>=1.4.1 -numpy<2.0.0 scipy>=1.8.0 scikit-learn>=1.0.2 typing-extensions>=4.8.0; python_version < "3.11" diff --git a/requirements_dev.txt b/requirements_dev.txt index 1e3e46c..bb866a0 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -1,5 +1,5 @@ pytest>=7.0.0 -black==24.8.0 +black>=24.10.0 tqdm -pre-commit +pre-commit>=4.0.0 isort diff --git a/setup.py b/setup.py index b1f9089..612d377 100644 --- a/setup.py +++ b/setup.py @@ -9,7 +9,7 @@ name="wnb", version=__version__, 
description="Python library for the implementations of general and weighted naive Bayes (WNB) classifiers.", - keywords=["python", "machine learning", "bayes", "naive bayes", "classifier", "probabilistic"], + keywords=["python", "machine learning", "bayes", "naive bayes", "classifier"], author="Mehdi Samsami", author_email="mehdisamsami@live.com", license="BSD License", @@ -32,12 +32,12 @@ "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", "License :: OSI Approved :: BSD License", ], - python_requires=">=3.8,<3.13", + python_requires=">=3.8,<3.14", install_requires=[ "pandas>=1.4.1", - "numpy<2.0.0", "scipy>=1.8.0", "scikit-learn>=1.0.2", "typing-extensions>=4.8.0; python_full_version < '3.11'", @@ -45,9 +45,9 @@ extras_require={ "dev": [ "pytest>=7.0.0", - "black==24.8.0", + "black>=24.10.0", "tqdm", - "pre-commit", + "pre-commit>=4.0.0", "isort", ] }, From 26f6ca8984f42fdcf36ba078977d581a36a696fe Mon Sep 17 00:00:00 2001 From: Mehdi Samsami Date: Wed, 25 Dec 2024 12:49:40 +0330 Subject: [PATCH 2/9] use factorial function from python stdlib instead of numpy to avoid error in numpy 2 --- wnb/stats/discrete.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/wnb/stats/discrete.py b/wnb/stats/discrete.py index f01effc..348e9d9 100644 --- a/wnb/stats/discrete.py +++ b/wnb/stats/discrete.py @@ -1,3 +1,4 @@ +from math import factorial from typing import Any, Mapping import numpy as np @@ -82,7 +83,7 @@ def from_data(cls, data, **kwargs: Any) -> "PoissonDist": def pmf(self, x: int) -> float: return ( - (np.exp(-self.rate) * self.rate**x) / np.math.factorial(x) + (np.exp(-self.rate) * self.rate**x) / factorial(x) if x >= self._support[0] and x - int(x) == 0 else 0.0 ) From 90fe593ff581b0769d2af5856b06144c54c23b64 Mon Sep 17 00:00:00 2001 From: Mehdi Samsami Date: Wed, 25 Dec 2024 12:50:38 +0330 Subject: [PATCH 3/9] change estimator 
properties and validations to adhere to new guidelines added in sklearn 1.6 --- tests/test_gwnb.py | 6 ++++-- wnb/gnb.py | 30 ++++++++++++++++++++++++------ wnb/gwnb.py | 35 ++++++++++++++++++++++++++++------- 3 files changed, 56 insertions(+), 15 deletions(-) diff --git a/tests/test_gwnb.py b/tests/test_gwnb.py index e0b4b15..d4ab622 100644 --- a/tests/test_gwnb.py +++ b/tests/test_gwnb.py @@ -1,3 +1,5 @@ +import re + import numpy as np import pytest from sklearn.base import is_classifier @@ -131,8 +133,8 @@ def test_gwnb_non_binary(): y_ = np.array([1, 2, 3, 4, 4, 3, 2, 1, 1, 2]) clf = GaussianWNB() - msg = "Unknown label type: non-binary" - with pytest.raises(ValueError, match=msg): + pattern = re.compile(r"(Only binary classification is supported|Unknown label type: non-binary)") + with pytest.raises(ValueError, match=pattern): clf.fit(X_, y_) diff --git a/wnb/gnb.py b/wnb/gnb.py index 7605d83..b841be8 100644 --- a/wnb/gnb.py +++ b/wnb/gnb.py @@ -7,6 +7,8 @@ import numpy as np import pandas as pd +import sklearn +from packaging import version from scipy.special import logsumexp from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.exceptions import DataConversionWarning @@ -14,6 +16,14 @@ from sklearn.utils.multiclass import check_classification_targets from sklearn.utils.validation import check_is_fitted +if version.parse(sklearn.__version__) >= version.parse("1.6"): + from sklearn.utils.validation import validate_data +else: + + def validate_data(estimator, X, **kwargs): + return check_array(X, estimator=estimator, **kwargs) + + if sys.version_info >= (3, 11): from typing import Self else: @@ -83,6 +93,13 @@ def __init__( self.distributions = distributions self.alpha = alpha + if version.parse(sklearn.__version__) >= version.parse("1.6"): + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.target_tags.required = True + return tags + def _more_tags(self) -> dict[str, bool]: return {"requires_y": True} @@ -101,8 +118,9 @@ 
def _check_inputs(self, X, y) -> None: if self.n_classes_ == 1: raise ValueError("Classifier can't train when only one class is present") - X = check_array( - array=X, + X = validate_data( + self, + X, accept_sparse=False, accept_large_sparse=False, dtype=( @@ -112,7 +130,6 @@ def _check_inputs(self, X, y) -> None: ensure_2d=True, ensure_min_samples=1, ensure_min_features=1, - estimator=self, ) # Check if X contains complex values @@ -282,14 +299,15 @@ def predict_log_proba(self, X: MatrixLike) -> np.ndarray: check_is_fitted(self) # Input validation - X = check_array( - array=X, + X = validate_data( + self, + X, accept_large_sparse=False, force_all_finite=True, dtype=( None if any(d in self._get_distributions() for d in NonNumericDistributions) else "numeric" ), - estimator=self, + reset=False, ) # Check if the number of input features matches the data seen during fit diff --git a/wnb/gwnb.py b/wnb/gwnb.py index e33fc0d..73a1e12 100644 --- a/wnb/gwnb.py +++ b/wnb/gwnb.py @@ -8,6 +8,8 @@ import numpy as np import pandas as pd +import sklearn +from packaging import version from scipy.special import logsumexp from scipy.stats import norm from sklearn.base import BaseEstimator, ClassifierMixin @@ -16,6 +18,14 @@ from sklearn.utils.multiclass import check_classification_targets, type_of_target from sklearn.utils.validation import check_is_fitted +if version.parse(sklearn.__version__) >= version.parse("1.6"): + from sklearn.utils.validation import validate_data +else: + + def validate_data(estimator, X, **kwargs): + return check_array(X, estimator=estimator, **kwargs) + + if sys.version_info >= (3, 11): from typing import Self else: @@ -111,6 +121,14 @@ def __init__( self.C = C self.learning_hist = learning_hist + if version.parse(sklearn.__version__) >= version.parse("1.6"): + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.target_tags.required = True + tags.classifier_tags.multi_class = False + return tags + def _more_tags(self) -> dict[str, 
bool]: return {"binary_only": True, "requires_y": True} @@ -119,16 +137,20 @@ def _check_inputs(self, X, y) -> None: check_classification_targets(y) # Check that the dataset has only two unique labels - if type_of_target(y) != "binary": - warnings.warn("This version of MLD-WNB only supports binary classification.") - raise ValueError("Unknown label type: non-binary") + if (y_type := type_of_target(y)) != "binary": + if version.parse(sklearn.__version__) >= version.parse("1.6"): + msg = f"Only binary classification is supported. The type of the target is {y_type}." + else: + msg = "Unknown label type: non-binary" + raise ValueError(msg) # Check if only one class is present in label vector if self.n_classes_ == 1: raise ValueError("Classifier can't train when only one class is present.") - X = check_array( - array=X, + X = validate_data( + self, + X, accept_sparse=False, accept_large_sparse=False, dtype="numeric", @@ -136,7 +158,6 @@ def _check_inputs(self, X, y) -> None: ensure_2d=True, ensure_min_samples=1, ensure_min_features=1, - estimator=self, ) # Check if X contains complex values @@ -416,7 +437,7 @@ def predict_log_proba(self, X: MatrixLike) -> np.ndarray: check_is_fitted(self) # Input validation - X = check_array(array=X, accept_large_sparse=False, force_all_finite=True, estimator=self) + X = validate_data(self, X, accept_large_sparse=False, force_all_finite=True, reset=False) # Check if the number of input features matches the data seen during fit if X.shape[1] != self.n_features_in_: From f9c9f4822be05ec019af4f6af3012d034a8c0f55 Mon Sep 17 00:00:00 2001 From: Mehdi Samsami Date: Wed, 25 Dec 2024 12:51:15 +0330 Subject: [PATCH 4/9] add python 3.13 to testing workflow --- .github/workflows/run-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml index 1c7cd42..1d3f4ff 100644 --- a/.github/workflows/run-tests.yml +++ b/.github/workflows/run-tests.yml @@ -15,7 +15,7 @@ 
jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] + python-version: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13"] steps: - uses: actions/checkout@v3 From 1f5bf5105c31d339412245f9e31014b0b85ed0c2 Mon Sep 17 00:00:00 2001 From: Mehdi Samsami Date: Wed, 25 Dec 2024 12:53:14 +0330 Subject: [PATCH 5/9] rename to _utils to make it internal, fix imports accordingly --- wnb/gnb.py | 2 +- wnb/stats/{utils.py => _utils.py} | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) rename wnb/stats/{utils.py => _utils.py} (96%) diff --git a/wnb/gnb.py b/wnb/gnb.py index b841be8..9e0269f 100644 --- a/wnb/gnb.py +++ b/wnb/gnb.py @@ -30,9 +30,9 @@ def validate_data(estimator, X, **kwargs): from typing_extensions import Self from wnb.stats import Distribution, NonNumericDistributions +from wnb.stats._utils import get_dist_class, is_dist_supported from wnb.stats.base import DistMixin from wnb.stats.typing import DistributionLike -from wnb.stats.utils import get_dist_class, is_dist_supported from .typing import ArrayLike, Float, MatrixLike diff --git a/wnb/stats/utils.py b/wnb/stats/_utils.py similarity index 96% rename from wnb/stats/utils.py rename to wnb/stats/_utils.py index 9b4518e..7fc0da6 100644 --- a/wnb/stats/utils.py +++ b/wnb/stats/_utils.py @@ -6,8 +6,6 @@ from .enums import Distribution from .typing import DistributionLike -__all__ = ["is_dist_supported", "get_dist_class"] - def is_dist_supported(dist: DistributionLike) -> bool: with contextlib.suppress(TypeError): From 9d642e383432b0613fd2607b941f3e34250d7f9a Mon Sep 17 00:00:00 2001 From: Mehdi Samsami Date: Wed, 25 Dec 2024 12:59:01 +0330 Subject: [PATCH 6/9] downgrade black version to support python 3.8 --- pyproject.toml | 2 +- requirements_dev.txt | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 18c84bd..3a754b3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,7 +48,7 @@ Source = 
"https://github.com/msamsami/wnb" [project.optional-dependencies] dev = [ "pytest>=7.0.0", - "black>=24.10.0", + "black>=24.8.0", "tqdm", "pre-commit>=4.0.0", "isort", diff --git a/requirements_dev.txt b/requirements_dev.txt index bb866a0..0c6b11a 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -1,5 +1,5 @@ pytest>=7.0.0 -black>=24.10.0 +black>=24.8.0 tqdm pre-commit>=4.0.0 isort diff --git a/setup.py b/setup.py index 612d377..1faf44f 100644 --- a/setup.py +++ b/setup.py @@ -45,7 +45,7 @@ extras_require={ "dev": [ "pytest>=7.0.0", - "black>=24.10.0", + "black>=24.8.0", "tqdm", "pre-commit>=4.0.0", "isort", From 7ec1e4477fbc1ba2254319b8849cc310b6f5b304 Mon Sep 17 00:00:00 2001 From: Mehdi Samsami Date: Wed, 25 Dec 2024 13:03:57 +0330 Subject: [PATCH 7/9] bump version -> 0.4.0 --- README.md | 4 ++-- wnb/__init__.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index e44fbcb..f61626f 100644 --- a/README.md +++ b/README.md @@ -7,9 +7,9 @@
-![Lastest Release](https://img.shields.io/badge/release-v0.3.1-green) +![Latest Release](https://img.shields.io/badge/release-v0.4.0-green) [![PyPI Version](https://img.shields.io/pypi/v/wnb)](https://pypi.org/project/wnb/) -![Python Versions](https://img.shields.io/badge/python-3.8%20%7C%203.9%20%7C%203.10%20%7C%203.11%20%7C%203.12-blue)
+![Python Versions](https://img.shields.io/badge/python-3.8%20%7C%203.9%20%7C%203.10%20%7C%203.11%20%7C%203.12%20%7C%203.13-blue)
![GitHub Workflow Status (build)](https://github.com/msamsami/wnb/actions/workflows/build.yml/badge.svg) ![PyPI License](https://img.shields.io/pypi/l/wnb) [![PyPi Downloads](https://static.pepy.tech/badge/wnb)](https://pepy.tech/project/wnb) diff --git a/wnb/__init__.py b/wnb/__init__.py index 079df96..da9aba4 100644 --- a/wnb/__init__.py +++ b/wnb/__init__.py @@ -2,7 +2,7 @@ Python library for the implementations of general and weighted naive Bayes (WNB) classifiers. """ -__version__ = "0.3.1" +__version__ = "0.4.0" __author__ = "Mehdi Samsami" From 53ee486870f3d37f634a5bd4a0c0d3fa45e9c452 Mon Sep 17 00:00:00 2001 From: Mehdi Samsami Date: Wed, 25 Dec 2024 13:08:04 +0330 Subject: [PATCH 8/9] downgrade pre-commit to support python 3.8 --- pyproject.toml | 2 +- requirements_dev.txt | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 3a754b3..028fbb5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -50,7 +50,7 @@ dev = [ "pytest>=7.0.0", "black>=24.8.0", "tqdm", - "pre-commit>=4.0.0", + "pre-commit>=3.5.0", "isort", ] diff --git a/requirements_dev.txt b/requirements_dev.txt index 0c6b11a..fe249be 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -1,5 +1,5 @@ pytest>=7.0.0 black>=24.8.0 tqdm -pre-commit>=4.0.0 +pre-commit>=3.5.0 isort diff --git a/setup.py b/setup.py index 1faf44f..7175641 100644 --- a/setup.py +++ b/setup.py @@ -47,7 +47,7 @@ "pytest>=7.0.0", "black>=24.8.0", "tqdm", - "pre-commit>=4.0.0", + "pre-commit>=3.5.0", "isort", ] }, From b897d75c953061fb99712bf0a0b0b2f25be2b309 Mon Sep 17 00:00:00 2001 From: Mehdi Samsami Date: Wed, 25 Dec 2024 13:30:07 +0330 Subject: [PATCH 9/9] add internal script for validate_data and sklearn version constant, update gnb and gwnb scripts accordingly --- wnb/_utils.py | 19 +++++++++++++++++++ wnb/gnb.py | 15 +++------------ wnb/gwnb.py | 17 ++++------------- 3 files changed, 26 insertions(+), 25 deletions(-) create mode 100644 
wnb/_utils.py diff --git a/wnb/_utils.py b/wnb/_utils.py new file mode 100644 index 0000000..0a1a366 --- /dev/null +++ b/wnb/_utils.py @@ -0,0 +1,19 @@ +from typing import Any + +import sklearn +from packaging import version +from sklearn.utils import check_array + +__all__ = ["SKLEARN_V1_6_OR_LATER", "validate_data"] + + +SKLEARN_V1_6_OR_LATER = version.parse(sklearn.__version__) >= version.parse("1.6") + + +if SKLEARN_V1_6_OR_LATER: + from sklearn.utils.validation import validate_data +else: + + def validate_data(estimator, X, **kwargs: Any): + kwargs.pop("reset", None) + return check_array(X, estimator=estimator, **kwargs) diff --git a/wnb/gnb.py b/wnb/gnb.py index 9e0269f..2f669e9 100644 --- a/wnb/gnb.py +++ b/wnb/gnb.py @@ -7,23 +7,13 @@ import numpy as np import pandas as pd -import sklearn -from packaging import version from scipy.special import logsumexp from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.exceptions import DataConversionWarning -from sklearn.utils import as_float_array, check_array +from sklearn.utils import as_float_array from sklearn.utils.multiclass import check_classification_targets from sklearn.utils.validation import check_is_fitted -if version.parse(sklearn.__version__) >= version.parse("1.6"): - from sklearn.utils.validation import validate_data -else: - - def validate_data(estimator, X, **kwargs): - return check_array(X, estimator=estimator, **kwargs) - - if sys.version_info >= (3, 11): from typing import Self else: @@ -34,6 +24,7 @@ def validate_data(estimator, X, **kwargs): from wnb.stats.base import DistMixin from wnb.stats.typing import DistributionLike +from ._utils import SKLEARN_V1_6_OR_LATER, validate_data from .typing import ArrayLike, Float, MatrixLike __all__ = ["GeneralNB"] @@ -93,7 +84,7 @@ def __init__( self.distributions = distributions self.alpha = alpha - if version.parse(sklearn.__version__) >= version.parse("1.6"): + if SKLEARN_V1_6_OR_LATER: def __sklearn_tags__(self): tags = 
super().__sklearn_tags__() diff --git a/wnb/gwnb.py b/wnb/gwnb.py index 73a1e12..4c4c091 100644 --- a/wnb/gwnb.py +++ b/wnb/gwnb.py @@ -8,29 +8,20 @@ import numpy as np import pandas as pd -import sklearn -from packaging import version from scipy.special import logsumexp from scipy.stats import norm from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.exceptions import DataConversionWarning -from sklearn.utils import as_float_array, check_array, deprecated +from sklearn.utils import as_float_array, deprecated from sklearn.utils.multiclass import check_classification_targets, type_of_target from sklearn.utils.validation import check_is_fitted -if version.parse(sklearn.__version__) >= version.parse("1.6"): - from sklearn.utils.validation import validate_data -else: - - def validate_data(estimator, X, **kwargs): - return check_array(X, estimator=estimator, **kwargs) - - if sys.version_info >= (3, 11): from typing import Self else: from typing_extensions import Self +from ._utils import SKLEARN_V1_6_OR_LATER, validate_data from .typing import ArrayLike, Float, Int, MatrixLike __all__ = ["GaussianWNB"] @@ -121,7 +112,7 @@ def __init__( self.C = C self.learning_hist = learning_hist - if version.parse(sklearn.__version__) >= version.parse("1.6"): + if SKLEARN_V1_6_OR_LATER: def __sklearn_tags__(self): tags = super().__sklearn_tags__() @@ -138,7 +129,7 @@ def _check_inputs(self, X, y) -> None: # Check that the dataset has only two unique labels if (y_type := type_of_target(y)) != "binary": - if version.parse(sklearn.__version__) >= version.parse("1.6"): + if SKLEARN_V1_6_OR_LATER: msg = f"Only binary classification is supported. The type of the target is {y_type}." else: msg = "Unknown label type: non-binary"