diff --git a/.circleci/config.yml b/.circleci/config.yml index fcc43270c668f..effb37d6b8faa 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -65,6 +65,22 @@ jobs: path: ~/log.txt destination: log.txt + pypy3: + docker: + - image: pypy:3-6.0.0 + steps: + - restore_cache: + keys: + - pypy3-ccache-{{ .Branch }} + - pypy3-ccache + - checkout + - run: ./build_tools/circle/build_test_pypy.sh + - save_cache: + key: pypy3-ccache-{{ .Branch }}-{{ .BuildNum }} + paths: + - ~/.ccache + - ~/.cache/pip + deploy: docker: - image: circleci/python:3.6.1 @@ -88,6 +104,7 @@ workflows: jobs: - python3 - python2 + - pypy3 - deploy: requires: - python3 diff --git a/build_tools/circle/build_test_pypy.sh b/build_tools/circle/build_test_pypy.sh new file mode 100755 index 0000000000000..18fa361821d14 --- /dev/null +++ b/build_tools/circle/build_test_pypy.sh @@ -0,0 +1,30 @@ +#!/usr/bin/env bash +set -x +set -e + +apt-get -yq update +apt-get -yq install libatlas-dev libatlas-base-dev liblapack-dev gfortran ccache + +pip install virtualenv + +if command -v pypy3; then + virtualenv -p $(command -v pypy3) pypy-env +elif command -v pypy; then + virtualenv -p $(command -v pypy) pypy-env +fi + +source pypy-env/bin/activate + +python --version +which python + +pip install --extra-index https://antocuni.github.io/pypy-wheels/ubuntu numpy==1.14.4 Cython pytest +pip install "scipy>=1.1.0" sphinx numpydoc docutils + +ccache -M 512M +export CCACHE_COMPRESS=1 +export PATH=/usr/lib/ccache:$PATH + +pip install -e . + +make test diff --git a/conftest.py b/conftest.py index c2b9ae2038875..621097bfc47ab 100644 --- a/conftest.py +++ b/conftest.py @@ -5,6 +5,7 @@ # doc/modules/clustering.rst and use sklearn from the local folder rather than # the one from site-packages. 
+import platform from distutils.version import LooseVersion import pytest @@ -12,6 +13,15 @@ def pytest_collection_modifyitems(config, items): + + # FeatureHasher is not compatible with PyPy + if platform.python_implementation() == 'PyPy': + skip_marker = pytest.mark.skip( + reason='FeatureHasher is not compatible with PyPy') + for item in items: + if item.name == 'sklearn.feature_extraction.hashing.FeatureHasher': + item.add_marker(skip_marker) + # numpy changed the str/repr formatting of numpy arrays in 1.14. We want to # run doctests only for numpy >= 1.14. skip_doctests = True diff --git a/doc/conftest.py b/doc/conftest.py index 11b190d8f66fd..7e229781cd32d 100644 --- a/doc/conftest.py +++ b/doc/conftest.py @@ -1,9 +1,11 @@ +import os from os.path import exists from os.path import join import warnings import numpy as np +from sklearn.utils import IS_PYPY from sklearn.utils.testing import SkipTest from sklearn.utils.testing import check_skip_network from sklearn.datasets import get_data_home @@ -56,6 +58,8 @@ def setup_twenty_newsgroups(): def setup_working_with_text_data(): + if IS_PYPY and os.environ.get('CI', None): + raise SkipTest('Skipping too slow test with PyPy on CI') check_skip_network() cache_path = _pkl_filepath(get_data_home(), CACHE_NAME) if not exists(cache_path): @@ -98,6 +102,8 @@ def pytest_runtest_setup(item): setup_working_with_text_data() elif fname.endswith('modules/compose.rst') or is_index: setup_compose() + elif IS_PYPY and fname.endswith('modules/feature_extraction.rst'): + raise SkipTest('FeatureHasher is not compatible with PyPy') elif fname.endswith('modules/impute.rst'): setup_impute() elif fname.endswith('statistical_inference/unsupervised_learning.rst'): diff --git a/doc/developers/advanced_installation.rst b/doc/developers/advanced_installation.rst index 19b6c2e455578..720c11ed98f4c 100644 --- a/doc/developers/advanced_installation.rst +++ b/doc/developers/advanced_installation.rst @@ -38,6 +38,12 @@ Scikit-learn requires: - 
NumPy (>= 1.8.2), - SciPy (>= 0.13.3). +.. note:: + + For installing on PyPy, PyPy3-v5.10+, NumPy 1.14.0+, and SciPy 1.1.0+ + are required. For PyPy, only installation instructions with pip apply. + + Building Scikit-learn also requires - Cython >=0.23 diff --git a/doc/developers/contributing.rst b/doc/developers/contributing.rst index 1b43400f4a7a0..a27bae14ba250 100644 --- a/doc/developers/contributing.rst +++ b/doc/developers/contributing.rst @@ -352,7 +352,7 @@ and Cython optimizations. * Travis is used for testing on Linux platforms * Appveyor is used for testing on Windows platforms - * CircleCI is used to build the docs for viewing + * CircleCI is used to build the docs for viewing and for testing with PyPy on Linux Please note that if one of the following markers appear in the latest commit message, the following actions are taken. diff --git a/doc/faq.rst b/doc/faq.rst index 85ec39e45ba3f..bef75f58e1795 100644 --- a/doc/faq.rst +++ b/doc/faq.rst @@ -179,12 +179,10 @@ careful choice of algorithms. Do you support PyPy? -------------------- -In case you didn't know, `PyPy `_ is the new, fast, -just-in-time compiling Python implementation. We don't support it. -When the `NumPy support `_ -in PyPy is complete or near-complete, and SciPy is ported over as well, -we can start thinking of a port. -We use too much of NumPy to work with a partial implementation. +In case you didn't know, `PyPy <https://pypy.org/>`_ is an alternative +Python implementation with a built-in just-in-time compiler. Experimental +support for PyPy3-v5.10+ has been added, which requires NumPy 1.14.0+ +and SciPy 1.1.0+. How do I deal with string data (or trees, graphs...)? ----------------------------------------------------- diff --git a/doc/install.rst b/doc/install.rst index 89c1aca455c7f..7dbb2287c4063 100644 --- a/doc/install.rst +++ b/doc/install.rst @@ -52,6 +52,12 @@ it as ``scikit-learn[alldeps]``. The most common use case for this is in a application or a Docker image.
This option is not intended for manual installation from the command line. +.. note:: + + For installing on PyPy, PyPy3-v5.10+, NumPy 1.14.0+, and SciPy 1.1.0+ + are required. + + For installation instructions for more distributions see :ref:`other distributions `. For compiling the development version from source, or building the package diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst index cc53041b91215..86c8d7a8ddab7 100644 --- a/doc/whats_new/v0.20.rst +++ b/doc/whats_new/v0.20.rst @@ -431,6 +431,12 @@ Miscellaneous :issue:`9101` by :user:`alex-33 ` and :user:`Maskani Filali Mohamed `. +- Add almost complete PyPy 3 support. Known unsupported functionalities are + :func:`datasets.load_svmlight_file`, :class:`feature_extraction.FeatureHasher` and + :class:`feature_extraction.text.HashingVectorizer`. For running on PyPy, PyPy3-v5.10+, + NumPy 1.14.0+, and SciPy 1.1.0+ are required. + :issue:`11010` by :user:`Ronan Lamy <rlamy>` and `Roman Yurchak`_. + Bug fixes ......... diff --git a/setup.py b/setup.py index 206cd645afecc..530ec899dc406 100755 --- a/setup.py +++ b/setup.py @@ -3,10 +3,10 @@ # Copyright (C) 2007-2009 Cournapeau David # 2010 Fabian Pedregosa # License: 3-clause BSD -descr = """A set of python modules for machine learning and data mining""" import sys import os +import platform import shutil from distutils.command.clean import clean as Clean from pkg_resources import parse_version @@ -41,8 +41,12 @@ VERSION = sklearn.__version__ -SCIPY_MIN_VERSION = '0.13.3' -NUMPY_MIN_VERSION = '1.8.2' +if platform.python_implementation() == 'PyPy': + SCIPY_MIN_VERSION = '1.1.0' + NUMPY_MIN_VERSION = '1.14.0' +else: + SCIPY_MIN_VERSION = '0.13.3' + NUMPY_MIN_VERSION = '1.8.2' # Optional setuptools features @@ -185,6 +189,10 @@ def setup_package(): 'Programming Language :: Python :: 3.4', 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', + ('Programming Language :: Python :: ' + 'Implementation :: CPython'), + ('Programming
Language :: Python :: ' + 'Implementation :: PyPy') ], cmdclass=cmdclass, install_requires=[ diff --git a/sklearn/datasets/setup.py b/sklearn/datasets/setup.py index a1def76c1bfce..3a8936bedffe8 100644 --- a/sklearn/datasets/setup.py +++ b/sklearn/datasets/setup.py @@ -1,6 +1,7 @@ import numpy import os +import platform def configuration(parent_package='', top_path=None): @@ -10,9 +11,10 @@ def configuration(parent_package='', top_path=None): config.add_data_dir('descr') config.add_data_dir('images') config.add_data_dir(os.path.join('tests', 'data')) - config.add_extension('_svmlight_format', - sources=['_svmlight_format.pyx'], - include_dirs=[numpy.get_include()]) + if platform.python_implementation() != 'PyPy': + config.add_extension('_svmlight_format', + sources=['_svmlight_format.pyx'], + include_dirs=[numpy.get_include()]) config.add_subpackage('tests') return config diff --git a/sklearn/datasets/svmlight_format.py b/sklearn/datasets/svmlight_format.py index 357b257e542b8..42de5943b6d5d 100644 --- a/sklearn/datasets/svmlight_format.py +++ b/sklearn/datasets/svmlight_format.py @@ -22,12 +22,21 @@ import numpy as np import scipy.sparse as sp -from ._svmlight_format import _load_svmlight_file from .. 
import __version__ from ..externals import six from ..externals.six import u, b from ..externals.six.moves import range, zip -from ..utils import check_array +from ..utils import check_array, IS_PYPY + +if not IS_PYPY: + from ._svmlight_format import _load_svmlight_file +else: + def _load_svmlight_file(*args, **kwargs): + raise NotImplementedError( + 'load_svmlight_file is currently not ' + 'compatible with PyPy (see ' + 'https://github.com/scikit-learn/scikit-learn/issues/11543 ' + 'for the status updates).') def load_svmlight_file(f, n_features=None, dtype=np.float64, diff --git a/sklearn/datasets/tests/test_svmlight_format.py b/sklearn/datasets/tests/test_svmlight_format.py index 3eab1d7c37eba..ca1f7ddae8ecd 100644 --- a/sklearn/datasets/tests/test_svmlight_format.py +++ b/sklearn/datasets/tests/test_svmlight_format.py @@ -18,6 +18,7 @@ from sklearn.utils.testing import assert_raises from sklearn.utils.testing import assert_raises_regex from sklearn.utils.testing import assert_in +from sklearn.utils.testing import fails_if_pypy from sklearn.utils.fixes import sp_version import sklearn @@ -30,6 +31,8 @@ invalidfile = os.path.join(currdir, "data", "svmlight_invalid.txt") invalidfile2 = os.path.join(currdir, "data", "svmlight_invalid_order.txt") +pytestmark = fails_if_pypy + def test_load_svmlight_file(): X, y = load_svmlight_file(datafile) @@ -119,7 +122,8 @@ def test_load_compressed(): with NamedTemporaryFile(prefix="sklearn-test", suffix=".gz") as tmp: tmp.close() # necessary under windows with open(datafile, "rb") as f: - shutil.copyfileobj(f, gzip.open(tmp.name, "wb")) + with gzip.open(tmp.name, "wb") as fh_out: + shutil.copyfileobj(f, fh_out) Xgz, ygz = load_svmlight_file(tmp.name) # because we "close" it manually and write to it, # we need to remove it manually. 
@@ -130,7 +134,8 @@ def test_load_compressed(): with NamedTemporaryFile(prefix="sklearn-test", suffix=".bz2") as tmp: tmp.close() # necessary under windows with open(datafile, "rb") as f: - shutil.copyfileobj(f, BZ2File(tmp.name, "wb")) + with BZ2File(tmp.name, "wb") as fh_out: + shutil.copyfileobj(f, fh_out) Xbz, ybz = load_svmlight_file(tmp.name) # because we "close" it manually and write to it, # we need to remove it manually. diff --git a/sklearn/ensemble/gradient_boosting.py b/sklearn/ensemble/gradient_boosting.py index f32ac76990353..ad25a1965b266 100644 --- a/sklearn/ensemble/gradient_boosting.py +++ b/sklearn/ensemble/gradient_boosting.py @@ -1336,12 +1336,14 @@ def _resize_state(self): raise ValueError('resize with smaller n_estimators %d < %d' % (total_n_estimators, self.estimators_[0])) - self.estimators_.resize((total_n_estimators, self.loss_.K)) - self.train_score_.resize(total_n_estimators) + self.estimators_ = np.resize(self.estimators_, + (total_n_estimators, self.loss_.K)) + self.train_score_ = np.resize(self.train_score_, total_n_estimators) if (self.subsample < 1 or hasattr(self, 'oob_improvement_')): # if do oob resize arrays or create new if not available if hasattr(self, 'oob_improvement_'): - self.oob_improvement_.resize(total_n_estimators) + self.oob_improvement_ = np.resize(self.oob_improvement_, + total_n_estimators) else: self.oob_improvement_ = np.zeros((total_n_estimators,), dtype=np.float64) diff --git a/sklearn/feature_extraction/hashing.py b/sklearn/feature_extraction/hashing.py index 9795d30aa6758..744a073090bad 100644 --- a/sklearn/feature_extraction/hashing.py +++ b/sklearn/feature_extraction/hashing.py @@ -7,9 +7,18 @@ import numpy as np import scipy.sparse as sp -from . 
import _hashing +from ..utils import IS_PYPY from ..base import BaseEstimator, TransformerMixin +if not IS_PYPY: + from ._hashing import transform as _hashing_transform +else: + def _hashing_transform(*args, **kwargs): + raise NotImplementedError( + 'FeatureHasher is not compatible with PyPy (see ' + 'https://github.com/scikit-learn/scikit-learn/issues/11540 ' + 'for the status updates).') + def _iteritems(d): """Like d.iteritems, but accepts any collections.Mapping.""" @@ -155,7 +164,7 @@ def transform(self, raw_X): elif self.input_type == "string": raw_X = (((f, 1) for f in x) for x in raw_X) indices, indptr, values = \ - _hashing.transform(raw_X, self.n_features, self.dtype, + _hashing_transform(raw_X, self.n_features, self.dtype, self.alternate_sign) n_samples = indptr.shape[0] - 1 diff --git a/sklearn/feature_extraction/setup.py b/sklearn/feature_extraction/setup.py index 7b71dfdcc83d7..761ff1ee5a7d3 100644 --- a/sklearn/feature_extraction/setup.py +++ b/sklearn/feature_extraction/setup.py @@ -1,4 +1,5 @@ import os +import platform def configuration(parent_package='', top_path=None): @@ -10,10 +11,11 @@ def configuration(parent_package='', top_path=None): if os.name == 'posix': libraries.append('m') - config.add_extension('_hashing', - sources=['_hashing.pyx'], - include_dirs=[numpy.get_include()], - libraries=libraries) + if platform.python_implementation() != 'PyPy': + config.add_extension('_hashing', + sources=['_hashing.pyx'], + include_dirs=[numpy.get_include()], + libraries=libraries) config.add_subpackage("tests") return config diff --git a/sklearn/feature_extraction/tests/test_feature_hasher.py b/sklearn/feature_extraction/tests/test_feature_hasher.py index 6f0d6b0214953..77a21ff4364a7 100644 --- a/sklearn/feature_extraction/tests/test_feature_hasher.py +++ b/sklearn/feature_extraction/tests/test_feature_hasher.py @@ -5,7 +5,9 @@ from sklearn.feature_extraction import FeatureHasher from sklearn.utils.testing import (assert_raises, assert_true, 
assert_equal, - ignore_warnings) + ignore_warnings, fails_if_pypy) + +pytestmark = fails_if_pypy def test_feature_hasher_dicts(): diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py index 493a0dabffde7..b9431bc5439cb 100644 --- a/sklearn/feature_extraction/tests/test_text.py +++ b/sklearn/feature_extraction/tests/test_text.py @@ -27,13 +27,14 @@ import numpy as np from numpy.testing import assert_array_almost_equal from numpy.testing import assert_array_equal +from sklearn.utils import IS_PYPY from sklearn.utils.testing import (assert_equal, assert_false, assert_true, assert_not_equal, assert_almost_equal, assert_in, assert_less, assert_greater, assert_warns_message, assert_raise_message, clean_warning_registry, ignore_warnings, SkipTest, assert_raises, assert_no_warnings, - assert_allclose_dense_sparse) + fails_if_pypy, assert_allclose_dense_sparse) from sklearn.utils.fixes import _Mapping as Mapping from collections import defaultdict from functools import partial @@ -503,6 +504,7 @@ def test_tfidf_vectorizer_setters(): assert_true(tv._tfidf.sublinear_tf) +@fails_if_pypy @ignore_warnings(category=DeprecationWarning) def test_hashing_vectorizer(): v = HashingVectorizer() @@ -685,6 +687,7 @@ def test_count_binary_occurrences(): assert_equal(X_sparse.dtype, np.float32) +@fails_if_pypy @ignore_warnings(category=DeprecationWarning) def test_hashed_binary_occurrences(): # by default multiple occurrences are counted as longs @@ -824,6 +827,7 @@ def test_vectorizer_pipeline_cross_validation(): assert_array_equal(cv_scores, [1., 1., 1.]) +@fails_if_pypy @ignore_warnings(category=DeprecationWarning) def test_vectorizer_unicode(): # tests that the count vectorizer works with cyrillic. 
@@ -891,9 +895,12 @@ def test_pickling_vectorizer(): copy = pickle.loads(s) assert_equal(type(copy), orig.__class__) assert_equal(copy.get_params(), orig.get_params()) - assert_array_equal( - copy.fit_transform(JUNK_FOOD_DOCS).toarray(), - orig.fit_transform(JUNK_FOOD_DOCS).toarray()) + if IS_PYPY and isinstance(orig, HashingVectorizer): + continue + else: + assert_array_equal( + copy.fit_transform(JUNK_FOOD_DOCS).toarray(), + orig.fit_transform(JUNK_FOOD_DOCS).toarray()) def test_countvectorizer_vocab_sets_when_pickling(): @@ -995,6 +1002,7 @@ def test_non_unique_vocab(): assert_raises(ValueError, vect.fit, []) +@fails_if_pypy def test_hashingvectorizer_nan_in_docs(): # np.nan can appear when using pandas to load text fields from a csv file # with missing values. diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index e3c72d6884591..2b17f41010eeb 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -1771,6 +1771,9 @@ class KernelCenterer(BaseEstimator, TransformerMixin): Read more in the :ref:`User Guide `. """ + def __init__(self): + # Needed for backported inspect.signature compatibility with PyPy + pass def fit(self, K, y=None): """Fit KernelCenterer diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index 2cde1ca9a14af..8e5f020985b19 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -28,6 +28,7 @@ from sklearn.cluster.bicluster import BiclusterMixin from sklearn.linear_model.base import LinearClassifierMixin +from sklearn.utils import IS_PYPY from sklearn.utils.estimator_checks import ( _yield_all_checks, set_checking_parameters, @@ -163,6 +164,9 @@ def test_import_all_consistency(): for modname in submods + ['sklearn']: if ".tests." 
in modname: continue + if IS_PYPY and ('_svmlight_format' in modname or + 'feature_extraction._hashing' in modname): + continue package = __import__(modname, fromlist="dummy") for name in getattr(package, '__all__', ()): if getattr(package, name, None) is None: diff --git a/sklearn/tests/test_docstring_parameters.py b/sklearn/tests/test_docstring_parameters.py index 6bbc4651a8d51..df139743d7c08 100644 --- a/sklearn/tests/test_docstring_parameters.py +++ b/sklearn/tests/test_docstring_parameters.py @@ -12,6 +12,7 @@ import sklearn from sklearn.base import signature +from sklearn.utils import IS_PYPY from sklearn.utils.testing import SkipTest from sklearn.utils.testing import check_docstring_parameters from sklearn.utils.testing import _get_func_name @@ -143,6 +144,11 @@ def test_tabs(): # Test that there are no tabs in our source files for importer, modname, ispkg in walk_packages(sklearn.__path__, prefix='sklearn.'): + + if IS_PYPY and ('_svmlight_format' in modname or + 'feature_extraction._hashing' in modname): + continue + # because we don't import mod = importlib.import_module(modname) try: diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 56177fca7a693..84f7ceae5dd5d 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -1,8 +1,8 @@ """ The :mod:`sklearn.utils` module includes various utilities. 
""" - import numbers +import platform import numpy as np from scipy.sparse import issparse @@ -32,6 +32,8 @@ "cpu_count", "Parallel", "Memory", "delayed", "parallel_backend", "hash"] +IS_PYPY = platform.python_implementation() == 'PyPy' + class Bunch(dict): """Container object for datasets diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 930654ee3ca97..5149900c9c473 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -14,6 +14,7 @@ from scipy.stats import rankdata from sklearn.externals.six.moves import zip +from sklearn.utils import IS_PYPY from sklearn.utils._joblib import hash, Memory from sklearn.utils.testing import assert_raises, _get_args from sklearn.utils.testing import assert_raises_regex @@ -1979,6 +1980,11 @@ def check_no_attributes_set_in_init(name, estimator): return init_params = _get_args(type(estimator).__init__) + if IS_PYPY: + # __init__ signature has additional objects in PyPy + for key in ['obj']: + if key in init_params: + init_params.remove(key) parents_init_params = [param for params_parent in (_get_args(parent) for parent in type(estimator).__mro__) diff --git a/sklearn/utils/testing.py b/sklearn/utils/testing.py index bfae5d4662b1c..ff91aa9624176 100644 --- a/sklearn/utils/testing.py +++ b/sklearn/utils/testing.py @@ -47,7 +47,7 @@ from sklearn.base import BaseEstimator from sklearn.externals import joblib from sklearn.utils.fixes import signature -from sklearn.utils import deprecated +from sklearn.utils import deprecated, IS_PYPY additional_names_in_all = [] @@ -625,7 +625,10 @@ def is_abstract(c): path = sklearn.__path__ for importer, modname, ispkg in pkgutil.walk_packages( path=path, prefix='sklearn.', onerror=lambda x: None): - if (".tests." in modname): + if ".tests." 
in modname: + continue + if IS_PYPY and ('_svmlight_format' in modname or + 'feature_extraction._hashing' in modname): continue module = __import__(modname, fromlist="dummy") classes = inspect.getmembers(module, inspect.isclass) @@ -706,6 +709,8 @@ def run_test(*args, **kwargs): reason='skipped on 32bit platforms') skip_travis = pytest.mark.skipif(os.environ.get('TRAVIS') == 'true', reason='skip on travis') + fails_if_pypy = pytest.mark.xfail(IS_PYPY, raises=NotImplementedError, + reason='not compatible with PyPy') # Decorator for tests involving both BLAS calls and multiprocessing. #