Use hypothesis (#5759)

* Use hypothesis
* Allow int64 array interface for groups
* Add packages to Windows CI
* Add to travis
* Make sure device index is set correctly
* Fix dask-cudf test
* appveyor
dmlc · Jun 16, 2020 · b47b5ac · b47b5ac
1 parent 02884b0
commit b47b5ac
Show file tree

Hide file tree

Showing 17 changed files with 414 additions and 442 deletions.
diff --git a/Jenkinsfile-win64 b/Jenkinsfile-win64
@@ -113,7 +113,7 @@ def TestWin64CPU() {
     """
     echo "Installing Python dependencies..."
     bat """
-     conda activate && conda upgrade scikit-learn pandas numpy
+     conda activate && conda install -y hypothesis && conda upgrade scikit-learn pandas numpy hypothesis
     """
     echo "Running Python tests..."
     bat "conda activate && python -m pytest -v -s --fulltrace tests\\python"
@@ -138,7 +138,7 @@ def TestWin64GPU(args) {
     """
     echo "Installing Python dependencies..."
     bat """
-     conda activate && conda upgrade scikit-learn pandas numpy
+     conda activate && conda install -y hypothesis && conda upgrade scikit-learn pandas numpy hypothesis
     """
     echo "Running Python tests..."
     bat """

diff --git a/appveyor.yml b/appveyor.yml
@@ -44,7 +44,7 @@ install:
     - if /i "%DO_PYTHON%" == "on" (
         conda config --set always_yes true &&
         conda update -q conda &&
-        conda install -y numpy scipy pandas matplotlib pytest scikit-learn graphviz python-graphviz
+        conda install -y numpy scipy pandas matplotlib pytest scikit-learn graphviz python-graphviz hypothesis
       )
     - set PATH=C:\Miniconda3-x64\Library\bin\graphviz;%PATH%
     # R: based on https://github.com/krlmlr/r-appveyor

diff --git a/src/data/data.cu b/src/data/data.cu
@@ -34,6 +34,30 @@ void CopyInfoImpl(ArrayInterface column, HostDeviceVector<float>* out) {
   });
 }
 
+void CopyGroupInfoImpl(ArrayInterface column, std::vector<bst_group_t>* out) {
+  CHECK(column.type[1] == 'i' || column.type[1] == 'u')  // integer kinds only
+      << "Expected integer metainfo";
+  auto SetDeviceToPtr = [](void* ptr) {  // make the device reported for ptr current
+    cudaPointerAttributes attr;
+    dh::safe_cuda(cudaPointerGetAttributes(&attr, ptr));
+    int32_t ptr_device = attr.device;
+    dh::safe_cuda(cudaSetDevice(ptr_device));
+    return ptr_device;
+  };
+  auto ptr_device = SetDeviceToPtr(column.data);
+  dh::TemporaryArray<bst_group_t> temp(column.num_rows);  // scratch for the device lambda below
+  auto d_tmp = temp.data();
+
+  dh::LaunchN(ptr_device, column.num_rows, [=] __device__(size_t idx) {
+    d_tmp[idx] = column.GetElement(idx);  // store each input element as bst_group_t
+  });
+  auto length = column.num_rows;
+  out->resize(length + 1);  // one extra slot for the leading zero
+  out->at(0) = 0;
+  thrust::copy(temp.data(), temp.data() + length, out->begin() + 1);  // fill out[1..length]
+  std::partial_sum(out->begin(), out->end(), out->begin());  // in-place running total
+}
+
 void MetaInfo::SetInfo(const char * c_key, std::string const& interface_str) {
   Json j_interface = Json::Load({interface_str.c_str(), interface_str.size()});
   auto const& j_arr = get<Array>(j_interface);
@@ -53,16 +77,7 @@ void MetaInfo::SetInfo(const char * c_key, std::string const& interface_str) {
   } else if (key == "base_margin") {
     CopyInfoImpl(array_interface, &base_margin_);
   } else if (key == "group") {
-    // Ranking is not performed on device.
-    thrust::device_ptr<uint32_t> p_src{
-        reinterpret_cast<uint32_t*>(array_interface.data)};
-
-    auto length = array_interface.num_rows;
-    group_ptr_.resize(length + 1);
-    group_ptr_[0] = 0;
-    thrust::copy(p_src, p_src + length, group_ptr_.begin() + 1);
-    std::partial_sum(group_ptr_.begin(), group_ptr_.end(), group_ptr_.begin());
-
+    CopyGroupInfoImpl(array_interface, &group_ptr_);
     return;
   } else {
     LOG(FATAL) << "Unknown metainfo: " << key;

diff --git a/tests/ci_build/Dockerfile.cpu b/tests/ci_build/Dockerfile.cpu
@@ -22,7 +22,7 @@ ENV GOSU_VERSION 1.10
 # Install Python packages in default env
 RUN \
     pip install pyyaml cpplint pylint astroid sphinx numpy scipy pandas matplotlib sh \
-    		recommonmark guzzle_sphinx_theme mock breathe graphviz \
+    		recommonmark guzzle_sphinx_theme mock breathe graphviz hypothesis\
 		pytest scikit-learn wheel kubernetes urllib3 jsonschema boto3 && \
     pip install https://h2o-release.s3.amazonaws.com/datatable/stable/datatable-0.7.0/datatable-0.7.0-cp37-cp37m-linux_x86_64.whl && \
     pip install "dask[complete]"

diff --git a/tests/ci_build/Dockerfile.cudf b/tests/ci_build/Dockerfile.cudf
@@ -19,7 +19,7 @@ ENV PATH=/opt/python/bin:$PATH
 RUN \
     conda create -n cudf_test -c rapidsai -c nvidia -c conda-forge -c defaults \
         python=3.7 cudf cudatoolkit=$CUDA_VERSION dask dask-cuda dask-cudf cupy \
-        numpy pytest scipy scikit-learn pandas matplotlib wheel python-kubernetes urllib3 graphviz
+        numpy pytest scipy scikit-learn pandas matplotlib wheel python-kubernetes urllib3 graphviz hypothesis
 
 ENV GOSU_VERSION 1.10
 

diff --git a/tests/ci_build/Dockerfile.gpu b/tests/ci_build/Dockerfile.gpu
@@ -18,7 +18,7 @@ ENV PATH=/opt/python/bin:$PATH
 RUN \
     conda create -n gpu_test -c rapidsai -c nvidia -c conda-forge -c defaults \
         python=3.7 dask dask-cuda numpy pytest scipy  scikit-learn pandas \
-        matplotlib wheel python-kubernetes urllib3 graphviz
+        matplotlib wheel python-kubernetes urllib3 graphviz hypothesis
 
 ENV GOSU_VERSION 1.10
 

diff --git a/tests/cpp/data/test_metainfo.cu b/tests/cpp/data/test_metainfo.cu
@@ -21,7 +21,7 @@ std::string PrepareData(std::string typestr, thrust::device_vector<T>* out, cons
 
   std::vector<Json> j_shape {Json(Integer(static_cast<Integer::Int>(kRows)))};
   column["shape"] = Array(j_shape);
-  column["strides"] = Array(std::vector<Json>{Json(Integer(static_cast<Integer::Int>(4)))});
+  column["strides"] = Array(std::vector<Json>{Json(Integer(static_cast<Integer::Int>(sizeof(T))))});
   column["version"] = Integer(static_cast<Integer::Int>(1));
   column["typestr"] = String(typestr);
 
@@ -78,16 +78,32 @@ TEST(MetaInfo, FromInterface) {
 
 TEST(MetaInfo, Group) {
   cudaSetDevice(0);
-  thrust::device_vector<uint32_t> d_data;
-  std::string str = PrepareData<uint32_t>("<u4", &d_data);
 
   MetaInfo info;
 
-  info.SetInfo("group", str.c_str());
-  auto const& h_group = info.group_ptr_;
-  ASSERT_EQ(h_group.size(), d_data.size() + 1);
+  thrust::device_vector<uint32_t> d_uint;
+  std::string uint_str = PrepareData<uint32_t>("<u4", &d_uint);
+  info.SetInfo("group", uint_str.c_str());
+  auto& h_group = info.group_ptr_;  // reference: follows `info` across the reassignments below
+  ASSERT_EQ(h_group.size(), d_uint.size() + 1);  // one more entry than the input
   for (size_t i = 1; i < h_group.size(); ++i) {
-    ASSERT_EQ(h_group[i], d_data[i-1] + h_group[i-1]) << "i: " << i;
+    ASSERT_EQ(h_group[i], d_uint[i - 1] + h_group[i - 1]) << "i: " << i;  // running sum
   }
+
+  thrust::device_vector<int64_t> d_int64;
+  std::string int_str = PrepareData<int64_t>("<i8", &d_int64);
+  info = MetaInfo();
+  info.SetInfo("group", int_str.c_str());
+  h_group = info.group_ptr_;  // self-assignment: h_group already aliases info.group_ptr_
+  ASSERT_EQ(h_group.size(), d_int64.size() + 1);  // compare with the int64 input itself
+  for (size_t i = 1; i < h_group.size(); ++i) {
+    ASSERT_EQ(h_group[i], d_int64[i - 1] + h_group[i - 1]) << "i: " << i;  // running sum
+  }
+
+  // Incorrect type
+  thrust::device_vector<float> d_float;
+  std::string float_str = PrepareData<float>("<f4", &d_float);
+  info = MetaInfo();
+  EXPECT_ANY_THROW(info.SetInfo("group", float_str.c_str()));  // float input must be rejected
 }
 }  // namespace xgboost
diff --git a/tests/python-gpu/test_gpu_linear.py b/tests/python-gpu/test_gpu_linear.py
@@ -1,30 +1,50 @@
 import sys
-import pytest
-import unittest
+from hypothesis import strategies, given, settings, assume
+import xgboost as xgb
+sys.path.append("tests/python")
+import testing as tm
 
-sys.path.append('tests/python/')
-import test_linear  # noqa: E402
-import testing as tm  # noqa: E402
 
+parameter_strategy = strategies.fixed_dictionaries({  # randomised gblinear parameters
+    'booster': strategies.just('gblinear'),  # always the linear booster
+    'eta': strategies.floats(0.01, 0.25),
+    'tolerance': strategies.floats(1e-5, 1e-2),
+    'nthread': strategies.integers(1, 4),
+    'feature_selector': strategies.sampled_from(['cyclic', 'shuffle',
+                                                 'greedy', 'thrifty']),
+    'top_k': strategies.integers(1, 10),
+})
 
-class TestGPULinear(unittest.TestCase):
-    datasets = ["Boston", "Digits", "Cancer", "Sparse regression"]
-    common_param = {
-        'booster': ['gblinear'],
-        'updater': ['gpu_coord_descent'],
-        'eta': [0.5],
-        'top_k': [10],
-        'tolerance': [1e-5],
-        'alpha': [.1],
-        'lambda': [0.005],
-        'coordinate_selection': ['cyclic', 'random', 'greedy']}
+def train_result(param, dmat, num_rounds):  # train on dmat, return its evaluation history
+    result = {}
+    xgb.train(param, dmat, num_rounds, [(dmat, 'train')], verbose_eval=False,
+              evals_result=result)  # xgb.train fills `result` in place
+    return result  # callers index it as result['train'][metric]
 
-    @pytest.mark.skipif(**tm.no_sklearn())
-    def test_gpu_coordinate(self):
-        parameters = self.common_param.copy()
-        parameters['gpu_id'] = [0]
-        for param in test_linear.parameter_combinations(parameters):
-            results = test_linear.run_suite(
-                param, 100, self.datasets, scale_features=True)
-            test_linear.assert_regression_result(results, 1e-2)
-            test_linear.assert_classification_result(results)
+
+class TestGPULinear:  # property-based tests for the 'gpu_coord_descent' updater
+    @given(parameter_strategy, strategies.integers(10, 50),
+           tm.dataset_strategy)
+    @settings(deadline=None)  # no per-example time limit
+    def test_gpu_coordinate(self, param, num_rounds, dataset):
+        assume(len(dataset.y) > 0)  # discard generated datasets with no labels
+        param['updater'] = 'gpu_coord_descent'
+        param = dataset.set_params(param)
+        result = train_result(param, dataset.get_dmat(), num_rounds)['train'][dataset.metric]
+        assert tm.non_increasing(result)  # presumably: metric never rises between rounds
+
+    # Loss is not guaranteed to always decrease because of regularisation parameters
+    # We test a weaker condition that the loss has not increased between the first and last
+    # iteration
+    @given(parameter_strategy, strategies.integers(10, 50),
+           tm.dataset_strategy, strategies.floats(1e-5, 2.0),
+           strategies.floats(1e-5, 2.0))
+    @settings(deadline=None)  # no per-example time limit
+    def test_gpu_coordinate_regularised(self, param, num_rounds, dataset, alpha, lambd):
+        assume(len(dataset.y) > 0)  # discard generated datasets with no labels
+        param['updater'] = 'gpu_coord_descent'
+        param['alpha'] = alpha
+        param['lambda'] = lambd
+        param = dataset.set_params(param)
+        result = train_result(param, dataset.get_dmat(), num_rounds)['train'][dataset.metric]
+        assert tm.non_increasing([result[0], result[-1]])  # compare first and last round only
diff --git a/tests/python-gpu/test_gpu_pickling.py b/tests/python-gpu/test_gpu_pickling.py
@@ -4,9 +4,13 @@
 import numpy as np
 import subprocess
 import os
+import sys
 import json
 import pytest
 
+sys.path.append("tests/python")
+import testing as tm
+
 import xgboost as xgb
 from xgboost import XGBClassifier
 
@@ -90,7 +94,6 @@ def test_wrap_gpu_id(self):
         )
         status = subprocess.call(args, env=env)
         assert status == 0
-
         os.remove(model_path)
 
     def test_pickled_predictor(self):

diff --git a/tests/python-gpu/test_gpu_prediction.py b/tests/python-gpu/test_gpu_prediction.py
@@ -158,10 +158,10 @@ def test_inplace_predict_cudf(self):
         rows = 1000
         cols = 10
         rng = np.random.RandomState(1994)
+        cp.cuda.runtime.setDevice(0)
         X = rng.randn(rows, cols)
         X = pd.DataFrame(X)
         y = rng.randn(rows)
-
         X = cudf.from_pandas(X)
 
         dtrain = xgb.DMatrix(X, y)